feat(backend): Phase 1-4 全新開發完成,37/37 TDD 通過

[Phase 0 Reset]
- 清除舊版 app/、alembic/versions/、雜亂測試腳本
- 新 requirements.txt (移除 caldav/redis/keycloak-lib,加入 apscheduler/croniter/docker/paramiko/ping3/dnspython)

[Phase 1 資料庫]
- 9 張資料表 SQLAlchemy Models:tenants / accounts / schedules / schedule_logs /
  tenant_schedule_results / account_schedule_results / servers / server_status_logs / system_status_logs
- Alembic migration 001_create_all_tables (已套用到 10.1.0.20:5433/virtual_mis)
- seed.py:schedules 初始 3 筆 / servers 初始 4 筆

[Phase 2 CRUD API]
- GET/POST/PUT/DELETE: /api/v1/tenants / accounts / servers / schedules
- /api/v1/system-status
- 帳號編碼自動產生 (prefix + seq_no 4碼左補0)
- 燈號 (lights) 從最新排程結果取得

[Phase 3 Watchdog]
- APScheduler interval 3分鐘,原子 UPDATE status=Going 防重複執行
- 手動觸發 API: POST /api/v1/schedules/{id}/run

[Phase 4 Service Clients]
- KeycloakClient:vmis-admin realm,REST API (不用 python-keycloak)
- MailClient:Docker Mailserver @ 10.1.0.254:8080,含 MX DNS 驗證
- DockerClient:docker-py 本機 + paramiko SSH 遠端 compose
- NextcloudClient:OCS API user/quota
- SystemChecker:功能驗證 (traefik routers>0 / keycloak token / SMTP EHLO / DB SELECT 1 / ping)

[TDD]
- 37 tests / 37 passed (2.11s)
- SQLite in-memory + StaticPool,無需外部 DB

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
VMIS Developer
2026-03-14 13:10:15 +08:00
parent 22611f7f73
commit 42d1420f9c
52 changed files with 2934 additions and 0 deletions

View File

View File

@@ -0,0 +1,86 @@
"""
DockerClient — docker-py (本機 Docker socket) + paramiko SSH (遠端 docker compose)
管理租戶的 NC / OO 容器。
"""
import logging
from typing import Optional
import httpx
from app.core.config import settings
logger = logging.getLogger(__name__)
class DockerClient:
    """Manage the per-tenant Nextcloud / OnlyOffice containers.

    Existing containers are inspected and started through docker-py (the
    Docker environment of the machine running this code).  Containers that do
    not exist yet are deployed by running ``docker compose up -d`` on the
    host configured in ``settings.DOCKER_SSH_HOST`` over SSH (paramiko).
    """

    def __init__(self):
        # The docker-py client is created lazily so that constructing this
        # class never requires a reachable Docker daemon.
        self._docker = None

    def _get_docker(self):
        """Return the cached docker-py client, creating it on first use."""
        if self._docker is None:
            import docker
            self._docker = docker.from_env()
        return self._docker

    def check_traefik_route(self, domain: str) -> bool:
        """Return True when Traefik has at least one HTTP router for *domain*.

        Two calls are made against the local Traefik API
        (``http://localhost:8080``): ``/api/overview`` must report a router
        total greater than zero, and ``/api/http/routers`` must contain a
        router whose ``rule`` mentions *domain* (substring match).

        Any error (connection failure, non-200 status, bad JSON) is logged
        and reported as False.
        """
        try:
            resp = httpx.get("http://localhost:8080/api/overview", timeout=5.0)
            if resp.status_code != 200:
                return False
            data = resp.json()
            # Functional check: Traefik must actually be serving routes.
            http_count = data.get("http", {}).get("routers", {}).get("total", 0)
            if http_count == 0:
                return False
            # Domain-specific check.
            routers_resp = httpx.get("http://localhost:8080/api/http/routers", timeout=5.0)
            if routers_resp.status_code != 200:
                return False
            routers = routers_resp.json()
            return any(domain in str(r.get("rule", "")) for r in routers)
        except Exception as e:
            logger.warning(f"Traefik check failed for {domain}: {e}")
            return False

    def ensure_container_running(self, container_name: str, tenant_code: str, realm: str) -> bool:
        """Make sure *container_name* is running and report whether it is.

        * running            -> True
        * exited             -> start it, then report the refreshed status
        * not found          -> deploy the tenant stack over SSH
        * any other status   -> False (no action is taken)
        * any other error    -> logged, False
        """
        try:
            docker_client = self._get_docker()
            container = docker_client.containers.get(container_name)
            if container.status == "running":
                return True
            elif container.status == "exited":
                container.start()
                # ``status`` is a cached attribute; reload to see the effect
                # of start().
                container.reload()
                return container.status == "running"
        except Exception as e:
            # "Not found" is detected from the error text so that this module
            # does not need to import docker just to name the exception type.
            if "Not Found" in str(e) or "404" in str(e):
                return self._ssh_compose_up(tenant_code, realm)
            logger.error(f"Docker check failed for {container_name}: {e}")
            return False
        return False

    @staticmethod
    def _compose_up_command(deploy_dir: str) -> str:
        """Build the remote shell command that brings a tenant stack up.

        The directory is shell-quoted because it is derived from the tenant
        code and is interpreted by a remote shell.
        """
        import shlex
        return f"cd {shlex.quote(deploy_dir)} && docker compose up -d 2>&1"

    def _ssh_compose_up(self, tenant_code: str, realm: str) -> bool:
        """Run ``docker compose up -d`` for *tenant_code* on the remote host.

        Connects to ``settings.DOCKER_SSH_HOST`` as
        ``settings.DOCKER_SSH_USER`` and runs the command inside
        ``{settings.TENANT_DEPLOY_BASE}/{tenant_code}``.  Returns True when
        the remote command exits with status 0; any error is logged and
        reported as False.  *realm* is accepted for interface compatibility
        but is not used.
        """
        try:
            import paramiko
            client = paramiko.SSHClient()
            try:
                # NOTE(review): AutoAddPolicy accepts unknown host keys
                # without verification — confirm this is acceptable here.
                client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                client.connect(
                    settings.DOCKER_SSH_HOST,
                    username=settings.DOCKER_SSH_USER,
                    timeout=15,
                )
                deploy_dir = f"{settings.TENANT_DEPLOY_BASE}/{tenant_code}"
                _stdin, stdout, _stderr = client.exec_command(
                    self._compose_up_command(deploy_dir)
                )
                exit_status = stdout.channel.recv_exit_status()
            finally:
                # Always release the SSH connection, including when connect()
                # or exec_command() raised.
                client.close()
            return exit_status == 0
        except Exception as e:
            logger.error(f"SSH compose up failed for {tenant_code}: {e}")
            return False

View File

@@ -0,0 +1,95 @@
"""
KeycloakClient — 直接呼叫 Keycloak REST API,不使用 python-keycloak 套件。
管理租戶 realm 及帳號的建立/查詢。
"""
import logging
from typing import Optional
import httpx
from app.core.config import settings
logger = logging.getLogger(__name__)
TIMEOUT = 10.0
class KeycloakClient:
    """Thin client for the Keycloak admin REST API (no python-keycloak).

    Authenticates with the client-credentials grant against
    ``settings.KEYCLOAK_ADMIN_REALM`` and manages tenant realms and users.
    """

    # Refresh the token this many seconds before its announced expiry.
    _TOKEN_EXPIRY_MARGIN = 10.0
    # Monotonic deadline of the cached token; None means "unknown", in which
    # case the cached token is reused without an expiry check.
    _token_expires_at: Optional[float] = None

    def __init__(self):
        self._base = settings.KEYCLOAK_URL.rstrip("/")
        self._admin_token: Optional[str] = None
        self._token_expires_at = None

    def _get_admin_token(self) -> str:
        """Fetch a fresh admin access token and remember when it expires.

        Raises:
            httpx.HTTPStatusError: when the token endpoint answers non-2xx.
        """
        import time

        url = f"{self._base}/realms/{settings.KEYCLOAK_ADMIN_REALM}/protocol/openid-connect/token"
        resp = httpx.post(
            url,
            data={
                "grant_type": "client_credentials",
                "client_id": settings.KEYCLOAK_ADMIN_CLIENT_ID,
                "client_secret": settings.KEYCLOAK_ADMIN_CLIENT_SECRET,
            },
            timeout=TIMEOUT,
        )
        resp.raise_for_status()
        body = resp.json()
        expires_in = body.get("expires_in")
        if isinstance(expires_in, (int, float)):
            self._token_expires_at = time.monotonic() + max(
                float(expires_in) - self._TOKEN_EXPIRY_MARGIN, 0.0
            )
        else:
            self._token_expires_at = None
        return body["access_token"]

    def _headers(self) -> dict:
        """Return the Authorization header, refreshing the token if needed.

        The token is fetched on first use and again once its known expiry
        has passed, so a long run does not keep sending an expired token.
        """
        import time

        expired = (
            self._token_expires_at is not None
            and time.monotonic() >= self._token_expires_at
        )
        if not self._admin_token or expired:
            self._admin_token = self._get_admin_token()
        return {"Authorization": f"Bearer {self._admin_token}"}

    def _admin_url(self, path: str) -> str:
        """Build an admin API URL below ``/admin/realms/``."""
        return f"{self._base}/admin/realms/{path}"

    @staticmethod
    def _uuid_from_location(location: str) -> Optional[str]:
        """Extract the trailing id from a ``Location`` header, or None."""
        return location.rstrip("/").split("/")[-1] or None

    def realm_exists(self, realm: str) -> bool:
        """Return True when the realm lookup answers HTTP 200.

        Errors (including a failed token fetch) are logged and reported as
        False.
        """
        try:
            resp = httpx.get(self._admin_url(realm), headers=self._headers(), timeout=TIMEOUT)
            return resp.status_code == 200
        except Exception as e:
            logger.warning(f"realm_exists({realm}) failed: {e}")
            return False

    def create_realm(self, realm: str, display_name: str) -> bool:
        """Create an enabled realm; True when Keycloak answers 201 or 204.

        Transport errors and token-fetch errors propagate to the caller.
        """
        payload = {
            "realm": realm,
            "displayName": display_name,
            "enabled": True,
            "loginTheme": "keycloak",
        }
        resp = httpx.post(
            f"{self._base}/admin/realms",
            json=payload,
            headers=self._headers(),
            timeout=TIMEOUT,
        )
        return resp.status_code in (201, 204)

    def get_user_uuid(self, realm: str, username: str) -> Optional[str]:
        """Return the id of the user with exactly *username*, or None.

        Raises:
            httpx.HTTPStatusError: when the lookup answers non-2xx.
        """
        resp = httpx.get(
            self._admin_url(f"{realm}/users"),
            params={"username": username, "exact": "true"},
            headers=self._headers(),
            timeout=TIMEOUT,
        )
        resp.raise_for_status()
        users = resp.json()
        return users[0]["id"] if users else None

    def create_user(self, realm: str, username: str, email: str, password: Optional[str]) -> Optional[str]:
        """Create an enabled, email-verified user and return its id.

        When *password* is given it is set as a temporary credential.
        Returns None when Keycloak does not answer 201 or when the response
        carries no usable ``Location`` header.
        """
        payload = {
            "username": username,
            "email": email,
            "enabled": True,
            "emailVerified": True,
        }
        if password:
            payload["credentials"] = [{"type": "password", "value": password, "temporary": True}]
        resp = httpx.post(
            self._admin_url(f"{realm}/users"),
            json=payload,
            headers=self._headers(),
            timeout=TIMEOUT,
        )
        if resp.status_code == 201:
            # The new user's id is the last path segment of the Location
            # header; an empty header must not be reported as an id.
            return self._uuid_from_location(resp.headers.get("Location", ""))
        logger.warning(f"create_user({realm}/{username}) returned HTTP {resp.status_code}")
        return None

View File

@@ -0,0 +1,81 @@
"""
MailClient — 呼叫 Docker Mailserver Admin API (http://10.1.0.254:8080)
管理 mail domain 和 mailbox 的建立/查詢。
建立 domain 前必須驗證 MX DNS 設定(對 active 租戶)。
"""
import logging
from typing import Optional
import httpx
import dns.resolver
from app.core.config import settings
logger = logging.getLogger(__name__)
TIMEOUT = 10.0
class MailClient:
    """Client for the mail server admin API at ``settings.MAIL_ADMIN_API_URL``.

    Looks up and creates mail domains and mailboxes, and checks a domain's MX
    records against ``settings.MAIL_MX_HOST``.
    """

    def __init__(self):
        self._base = settings.MAIL_ADMIN_API_URL.rstrip("/")
        self._headers = {"X-API-Key": settings.MAIL_ADMIN_API_KEY}

    @staticmethod
    def _mx_matches(exchange: str, expected: str) -> bool:
        """Return True when *expected* occurs in the MX target *exchange*.

        The trailing root dot is dropped and the comparison ignores case,
        because DNS names are case-insensitive.
        """
        return expected.casefold() in exchange.rstrip(".").casefold()

    def check_mx_dns(self, domain: str) -> bool:
        """Return True when an MX record of *domain* points at our mail host.

        Resolution errors (NXDOMAIN, no answer, timeout, ...) are logged and
        reported as False.
        """
        try:
            answers = dns.resolver.resolve(domain, "MX")
            for rdata in answers:
                if self._mx_matches(str(rdata.exchange), settings.MAIL_MX_HOST):
                    return True
            return False
        except Exception as e:
            logger.warning(f"MX DNS check failed for {domain}: {e}")
            return False

    def domain_exists(self, domain: str) -> bool:
        """Return True when the domain lookup answers HTTP 200.

        Transport errors are logged and reported as False, so an unreachable
        API can be told apart from a missing domain in the logs.
        """
        try:
            resp = httpx.get(
                f"{self._base}/api/v1/domains/{domain}",
                headers=self._headers,
                timeout=TIMEOUT,
            )
            return resp.status_code == 200
        except Exception as e:
            logger.warning(f"domain_exists({domain}) failed: {e}")
            return False

    def create_domain(self, domain: str) -> bool:
        """Create *domain*; True when the API answers 200, 201 or 204."""
        try:
            resp = httpx.post(
                f"{self._base}/api/v1/domains",
                json={"domain": domain},
                headers=self._headers,
                timeout=TIMEOUT,
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            logger.error(f"create_domain({domain}) failed: {e}")
            return False

    def mailbox_exists(self, email: str) -> bool:
        """Return True when the mailbox lookup answers HTTP 200.

        Transport errors are logged and reported as False.
        """
        try:
            resp = httpx.get(
                f"{self._base}/api/v1/mailboxes/{email}",
                headers=self._headers,
                timeout=TIMEOUT,
            )
            return resp.status_code == 200
        except Exception as e:
            logger.warning(f"mailbox_exists({email}) failed: {e}")
            return False

    def create_mailbox(self, email: str, password: Optional[str], quota_gb: int = 20) -> bool:
        """Create a mailbox; True when the API answers 200, 201 or 204.

        A missing *password* is sent as an empty string.
        NOTE(review): confirm the admin API accepts an empty password.
        """
        try:
            resp = httpx.post(
                f"{self._base}/api/v1/mailboxes",
                json={"email": email, "password": password or "", "quota": quota_gb},
                headers=self._headers,
                timeout=TIMEOUT,
            )
            return resp.status_code in (200, 201, 204)
        except Exception as e:
            logger.error(f"create_mailbox({email}) failed: {e}")
            return False

View File

@@ -0,0 +1,85 @@
"""
NextcloudClient — Nextcloud OCS API
管理 NC 使用者的查詢/建立與 quota 統計。
"""
import logging
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
OCS_HEADERS = {"OCS-APIRequest": "true"}
TIMEOUT = 15.0
class NextcloudClient:
    """Client for the Nextcloud OCS user-provisioning API of one instance.

    All calls use the ``ocs/v2.php`` endpoints.  The v1 endpoints answer
    HTTP 200 even when the OCS operation failed, which makes status-code
    checks meaningless; v2 maps the OCS result onto the HTTP status.
    """

    _BYTES_PER_GB = 1073741824

    def __init__(self, domain: str, admin_user: str = "admin", admin_password: str = ""):
        self._base = f"https://{domain}"
        self._auth = (admin_user, admin_password)

    def _users_url(self, username: Optional[str] = None) -> str:
        """Return the v2 users collection URL, or one user's URL."""
        url = f"{self._base}/ocs/v2.php/cloud/users"
        return f"{url}/{username}" if username is not None else url

    @classmethod
    def _bytes_to_gb(cls, used_bytes) -> float:
        """Convert a byte count to GiB with 4 decimals; None counts as 0."""
        return round((used_bytes or 0) / cls._BYTES_PER_GB, 4)

    def user_exists(self, username: str) -> bool:
        """Return True when the user lookup answers HTTP 200.

        Transport errors are logged and reported as False.
        """
        try:
            resp = httpx.get(
                self._users_url(username),
                auth=self._auth,
                headers=OCS_HEADERS,
                timeout=TIMEOUT,
            )
            return resp.status_code == 200
        except Exception as e:
            logger.warning(f"NC user_exists({username}) failed: {e}")
            return False

    def create_user(self, username: str, password: Optional[str], quota_gb: int = 20) -> bool:
        """Create a user with a ``{quota_gb}GB`` quota; True on HTTP 200.

        A missing *password* is sent as an empty string.
        NOTE(review): confirm the server accepts an empty password.
        """
        try:
            resp = httpx.post(
                self._users_url(),
                auth=self._auth,
                headers=OCS_HEADERS,
                data={
                    "userid": username,
                    "password": password or "",
                    "quota": f"{quota_gb}GB",
                },
                timeout=TIMEOUT,
            )
            return resp.status_code == 200
        except Exception as e:
            logger.error(f"NC create_user({username}) failed: {e}")
            return False

    def get_user_quota_used_gb(self, username: str) -> Optional[float]:
        """Return the user's used quota in GiB, or None when unavailable."""
        try:
            resp = httpx.get(
                self._users_url(username),
                auth=self._auth,
                headers=OCS_HEADERS,
                # OCS answers XML unless JSON is requested explicitly.
                params={"format": "json"},
                timeout=TIMEOUT,
            )
            if resp.status_code != 200:
                return None
            used_bytes = resp.json().get("ocs", {}).get("data", {}).get("quota", {}).get("used", 0)
            return self._bytes_to_gb(used_bytes)
        except Exception as e:
            logger.warning(f"NC quota lookup for {username} failed: {e}")
            return None

    def get_total_quota_used_gb(self) -> Optional[float]:
        """Sum the used quota of all listed users, in GiB.

        Only the first 500 users returned by the listing are considered, and
        one request is made per user.  Users whose quota cannot be read are
        skipped.  Returns None when the listing itself fails.
        """
        try:
            resp = httpx.get(
                self._users_url(),
                auth=self._auth,
                headers=OCS_HEADERS,
                params={"limit": 500, "format": "json"},
                timeout=TIMEOUT,
            )
            if resp.status_code != 200:
                return None
            users = resp.json().get("ocs", {}).get("data", {}).get("users", [])
            total = 0.0
            for uid in users:
                used = self.get_user_quota_used_gb(uid)
                if used:
                    total += used
            return round(total, 4)
        except Exception as e:
            logger.warning(f"NC total quota lookup failed: {e}")
            return None

View File

@@ -0,0 +1,59 @@
"""
Schedule dispatcher: routes schedule_id to the correct run function.
Also used by manual trigger API.
"""
import logging
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
logger = logging.getLogger(__name__)
def _finalize_own_log(db, log, status):
    """Best-effort: stamp a log created by dispatch_schedule with its outcome."""
    from datetime import datetime
    try:
        log.ended_at = datetime.utcnow()
        log.status = status
        db.commit()
    except Exception as e:
        # Never let bookkeeping mask the outcome of the schedule itself.
        logger.exception(f"Could not finalize schedule log: {e}")
        db.rollback()


def dispatch_schedule(schedule_id: int, log_id: int = None, db: Session = None):
    """Run the schedule function registered for *schedule_id*.

    Schedule ids map to: 1 = tenant check, 2 = account check,
    3 = system status.  An unknown id is logged and ignored.

    Args:
        schedule_id: Id of the schedule to run.
        log_id: Id of an existing ScheduleLog row.  When None, a new log row
            with status "running" is created here; if the schedule does not
            exist, nothing is run.
        db: Session to use.  When None, a session is opened here and closed
            before returning.

    A log row created here is also finalized here ("ok" / "error" plus
    ``ended_at``); a log passed in by the caller is left for the caller to
    finalize.

    Raises:
        Exception: whatever the schedule function raised, after logging it.
    """
    from datetime import datetime

    own_db = db is None
    if own_db:
        db = SessionLocal()
    own_log = None
    try:
        if log_id is None:
            from app.models.schedule import ScheduleLog, Schedule
            schedule = db.get(Schedule, schedule_id)
            if not schedule:
                return
            own_log = ScheduleLog(
                schedule_id=schedule_id,
                schedule_name=schedule.name,
                started_at=datetime.utcnow(),
                status="running",
            )
            db.add(own_log)
            db.commit()
            db.refresh(own_log)
            log_id = own_log.id

        try:
            if schedule_id == 1:
                from app.services.scheduler.schedule_tenant import run_tenant_check
                run_tenant_check(log_id, db)
            elif schedule_id == 2:
                from app.services.scheduler.schedule_account import run_account_check
                run_account_check(log_id, db)
            elif schedule_id == 3:
                from app.services.scheduler.schedule_system import run_system_status
                run_system_status(log_id, db)
            else:
                logger.warning(f"Unknown schedule_id: {schedule_id}")
        except Exception as e:
            logger.exception(f"dispatch_schedule({schedule_id}) error: {e}")
            if own_log is not None:
                # The failed run may have left the session unusable.
                db.rollback()
                _finalize_own_log(db, own_log, "error")
            raise
        else:
            if own_log is not None:
                _finalize_own_log(db, own_log, "ok")
    finally:
        if own_db:
            db.close()

View File

@@ -0,0 +1,103 @@
"""
Schedule 2 — 帳號檢查(每 3 分鐘)
檢查每個 active 帳號的: SSO使用者 / Mailbox / NC使用者 / Quota
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.account import Account
from app.models.result import AccountScheduleResult
logger = logging.getLogger(__name__)
def run_account_check(schedule_log_id: int, db: Session):
    """Check, and where missing create, each active account's resources.

    For every active account one AccountScheduleResult row is recorded with
    the outcome of four steps: Keycloak user, mailbox, Nextcloud user and
    Nextcloud quota usage.  A failing step never stops the remaining steps;
    its reason is collected into ``fail_reason``.  All rows are committed
    together at the end.

    Args:
        schedule_log_id: Id of the ScheduleLog row this run belongs to.
        db: Session used for reading accounts and writing results.
    """
    from app.services.keycloak_client import KeycloakClient
    from app.services.mail_client import MailClient
    from app.services.nextcloud_client import NextcloudClient

    accounts = (
        db.query(Account)
        .filter(Account.is_active == True)  # noqa: E712 (SQL expression)
        .all()
    )
    kc = KeycloakClient()
    mail = MailClient()

    for account in accounts:
        tenant = account.tenant
        realm = tenant.keycloak_realm or tenant.code
        result = AccountScheduleResult(
            schedule_log_id=schedule_log_id,
            account_id=account.id,
            sso_account=account.sso_account,
            recorded_at=datetime.utcnow(),
        )
        fail_reasons = []
        # One client serves both the user check and the quota lookup.
        # NOTE(review): no admin credentials are passed, so the client uses
        # its defaults — confirm where the real credentials should come from.
        nc = NextcloudClient(tenant.domain)

        # [1] SSO user: look up, create when missing.
        try:
            sso_uuid = kc.get_user_uuid(realm, account.sso_account)
            if sso_uuid:
                result.sso_result = True
                result.sso_uuid = sso_uuid
                if not account.sso_uuid:
                    account.sso_uuid = sso_uuid
            else:
                sso_uuid = kc.create_user(realm, account.sso_account, account.email, account.default_password)
                result.sso_result = sso_uuid is not None
                result.sso_uuid = sso_uuid
                if sso_uuid and not account.sso_uuid:
                    account.sso_uuid = sso_uuid
                if sso_uuid is None:
                    fail_reasons.append("sso: user create failed")
            result.sso_done_at = datetime.utcnow()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = datetime.utcnow()
            fail_reasons.append(f"sso: {e}")

        # [2] Mailbox: look up, create when missing.
        try:
            email = account.email or f"{account.sso_account}@{tenant.domain}"
            if mail.mailbox_exists(email):
                result.mailbox_result = True
            else:
                created = mail.create_mailbox(email, account.default_password, account.quota_limit)
                result.mailbox_result = created
                if not created:
                    fail_reasons.append("mailbox: create failed")
            result.mailbox_done_at = datetime.utcnow()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = datetime.utcnow()
            fail_reasons.append(f"mailbox: {e}")

        # [3] Nextcloud user: look up, create when missing.
        try:
            if nc.user_exists(account.sso_account):
                result.nc_result = True
            else:
                created = nc.create_user(account.sso_account, account.default_password, account.quota_limit)
                result.nc_result = created
                if not created:
                    fail_reasons.append("nc: create failed")
            result.nc_done_at = datetime.utcnow()
        except Exception as e:
            result.nc_result = False
            result.nc_done_at = datetime.utcnow()
            fail_reasons.append(f"nc: {e}")

        # [4] Quota usage (informational; a failure is only logged).
        try:
            result.quota_usage = nc.get_user_quota_used_gb(account.sso_account)
        except Exception as e:
            logger.warning(f"Quota check failed for {account.account_code}: {e}")

        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)
        db.add(result)

    # commit() flushes pending rows itself; no separate flush is needed.
    db.commit()
    logger.info(f"Account check done: {len(accounts)} accounts processed")

View File

@@ -0,0 +1,94 @@
"""
Schedule 3 — 系統狀態(每日 08:00)
Part A: 基礎設施服務功能驗證(traefik/keycloak/mail/db)
Part B: 伺服器 ping 檢查
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.server import SystemStatusLog, ServerStatusLog, Server
logger = logging.getLogger(__name__)
# Fixed 8 services: environment × service_name
# Each entry is probed once per run by run_system_status (Part A).  Besides
# the three descriptive keys (environment, service_name, service_desc), an
# entry carries the connection keys that run_system_status reads for its
# service_name:
#   traefik / mail -> "host", "port"
#   keycloak       -> "url", "realm"
#   db             -> "db_host", "db_port"
# NOTE(review): the test and prod traefik entries both point at
# localhost:8080 — confirm that is intended.
SERVICES = [
{"environment": "test", "service_name": "traefik",
"service_desc": "測試環境反向代理", "host": "localhost", "port": 8080},
{"environment": "test", "service_name": "keycloak",
"service_desc": "測試環境 SSO",
"url": "https://auth.lab.taipei", "realm": "master"},
{"environment": "test", "service_name": "mail",
"service_desc": "測試環境 Mail Server", "host": "localhost", "port": 587},
{"environment": "test", "service_name": "db",
"service_desc": "10.1.0.20:5433 PostgreSQL",
"db_host": "10.1.0.20", "db_port": 5433},
{"environment": "prod", "service_name": "traefik",
"service_desc": "正式環境反向代理", "host": "localhost", "port": 8080},
{"environment": "prod", "service_name": "keycloak",
"service_desc": "正式環境 SSO",
"url": "https://auth.ease.taipei", "realm": "master"},
{"environment": "prod", "service_name": "mail",
"service_desc": "正式環境 Mail Server", "host": "10.1.0.254", "port": 587},
{"environment": "prod", "service_name": "db",
"service_desc": "10.1.0.254:5432 PostgreSQL",
"db_host": "10.1.0.254", "db_port": 5432},
]
def run_system_status(schedule_log_id: int, db: Session):
    """Record the health of infrastructure services and servers.

    Part A probes every entry of SERVICES with the matching SystemChecker
    method and stores one SystemStatusLog row per entry.  Part B pings every
    active server (ordered by ``sort_order``) and stores one ServerStatusLog
    row per server.  All rows are committed together at the end.

    Args:
        schedule_log_id: Id of the ScheduleLog row this run belongs to.
        db: Session used for reading servers and writing results.
    """
    from app.services.system_checker import SystemChecker
    checker = SystemChecker()

    # service_name -> probe taking the SERVICES entry.
    probes = {
        "traefik": lambda svc: checker.check_traefik(svc["host"], svc["port"]),
        "keycloak": lambda svc: checker.check_keycloak(svc["url"], svc["realm"]),
        "mail": lambda svc: checker.check_smtp(svc["host"], svc["port"]),
        "db": lambda svc: checker.check_postgres(svc["db_host"], svc["db_port"]),
    }

    # Part A: Infrastructure services
    for svc in SERVICES:
        result = False
        fail_reason = None
        probe = probes.get(svc["service_name"])
        try:
            if probe is not None:
                result = probe(svc)
        except Exception as e:
            result = False
            fail_reason = str(e)
        if not result and fail_reason is None:
            # A probe that returns False without raising gives no detail;
            # still record that it failed, as Part B does for servers.
            fail_reason = "Check failed"
        db.add(SystemStatusLog(
            schedule_log_id=schedule_log_id,
            environment=svc["environment"],
            service_name=svc["service_name"],
            service_desc=svc["service_desc"],
            result=result,
            fail_reason=fail_reason,
            recorded_at=datetime.utcnow(),
        ))

    # Part B: Server ping
    servers = (
        db.query(Server)
        .filter(Server.is_active == True)  # noqa: E712 (SQL expression)
        .order_by(Server.sort_order)
        .all()
    )
    for server in servers:
        response_time = None
        fail_reason = None
        try:
            response_time = checker.ping_server(server.ip_address)
            result = response_time is not None
            if not result:
                fail_reason = "No response"
        except Exception as e:
            result = False
            fail_reason = str(e)
        db.add(ServerStatusLog(
            schedule_log_id=schedule_log_id,
            server_id=server.id,
            result=result,
            response_time=response_time,
            fail_reason=fail_reason,
            recorded_at=datetime.utcnow(),
        ))

    db.commit()
    logger.info(f"System status check done: {len(SERVICES)} services + {len(servers)} servers")

View File

@@ -0,0 +1,110 @@
"""
Schedule 1 — 租戶檢查(每 3 分鐘)
檢查每個 active 租戶的: Traefik路由 / SSO Realm / Mailbox Domain / NC容器 / OO容器 / Quota
"""
import logging
from datetime import datetime
from sqlalchemy.orm import Session
from app.models.tenant import Tenant
from app.models.result import TenantScheduleResult
logger = logging.getLogger(__name__)
def run_tenant_check(schedule_log_id: int, db: Session):
    """Check, and where missing create, each active tenant's resources.

    For every active tenant one TenantScheduleResult row is recorded and
    committed, covering six steps: Traefik route, Keycloak realm, mail
    domain, Nextcloud container, OnlyOffice container and total quota usage.

    A failing step does not stop the remaining steps, with one exception:
    when a tenant with status "active" has no matching MX record, the row is
    stored right away and the remaining steps are skipped for that tenant.

    Args:
        schedule_log_id: Id of the ScheduleLog row this run belongs to.
        db: Session used for reading tenants and writing results.
    """
    from app.services.keycloak_client import KeycloakClient
    from app.services.mail_client import MailClient
    from app.services.docker_client import DockerClient
    from app.services.nextcloud_client import NextcloudClient

    tenants = db.query(Tenant).filter(Tenant.is_active == True).all()  # noqa: E712
    kc = KeycloakClient()
    mail = MailClient()
    docker = DockerClient()

    for tenant in tenants:
        realm = tenant.keycloak_realm or tenant.code
        result = TenantScheduleResult(
            schedule_log_id=schedule_log_id,
            tenant_id=tenant.id,
            recorded_at=datetime.utcnow(),
        )
        fail_reasons = []

        # [1] Traefik
        try:
            result.traefik_status = docker.check_traefik_route(tenant.domain)
            result.traefik_done_at = datetime.utcnow()
        except Exception as e:
            result.traefik_status = False
            result.traefik_done_at = datetime.utcnow()
            fail_reasons.append(f"traefik: {e}")

        # [2] SSO realm: success means the realm exists or was created.
        try:
            realm_ok = kc.realm_exists(realm) or kc.create_realm(realm, tenant.name)
            result.sso_result = bool(realm_ok)
            if not realm_ok:
                fail_reasons.append("sso: realm create failed")
            result.sso_done_at = datetime.utcnow()
        except Exception as e:
            result.sso_result = False
            result.sso_done_at = datetime.utcnow()
            fail_reasons.append(f"sso: {e}")

        # [3] Mail domain (MX must be in place first for active tenants)
        try:
            if tenant.status == "active":
                dns_ok = mail.check_mx_dns(tenant.domain)
                if not dns_ok:
                    result.mailbox_result = False
                    result.mailbox_done_at = datetime.utcnow()
                    fail_reasons.append("mailbox: MX record not configured")
                    # Persist every reason collected so far before skipping
                    # the remaining steps for this tenant.
                    result.fail_reason = "; ".join(fail_reasons)
                    db.add(result)
                    db.commit()
                    continue
            domain_ok = mail.domain_exists(tenant.domain) or mail.create_domain(tenant.domain)
            result.mailbox_result = bool(domain_ok)
            if not domain_ok:
                fail_reasons.append("mailbox: domain create failed")
            result.mailbox_done_at = datetime.utcnow()
        except Exception as e:
            result.mailbox_result = False
            result.mailbox_done_at = datetime.utcnow()
            fail_reasons.append(f"mailbox: {e}")

        # [4] Nextcloud container
        try:
            nc_name = f"nc-{realm}"
            result.nc_result = docker.ensure_container_running(nc_name, tenant.code, realm)
            result.nc_done_at = datetime.utcnow()
        except Exception as e:
            result.nc_result = False
            result.nc_done_at = datetime.utcnow()
            fail_reasons.append(f"nc: {e}")

        # [5] OnlyOffice container
        try:
            oo_name = f"oo-{realm}"
            result.office_result = docker.ensure_container_running(oo_name, tenant.code, realm)
            result.office_done_at = datetime.utcnow()
        except Exception as e:
            result.office_result = False
            result.office_done_at = datetime.utcnow()
            fail_reasons.append(f"office: {e}")

        # [6] Quota (informational; a failure is only logged)
        try:
            # NOTE(review): no admin credentials are passed, so the client
            # uses its defaults — confirm where the real ones should come from.
            nc = NextcloudClient(tenant.domain)
            result.quota_usage = nc.get_total_quota_used_gb()
        except Exception as e:
            logger.warning(f"Quota check failed for {tenant.code}: {e}")

        if fail_reasons:
            result.fail_reason = "; ".join(fail_reasons)
        db.add(result)
        db.commit()

    logger.info(f"Tenant check done: {len(tenants)} tenants processed")

View File

@@ -0,0 +1,107 @@
"""
Watchdog: APScheduler BackgroundScheduler,每 3 分鐘掃描 schedules 表。
防重複執行:原子 UPDATE status='Going',影響 0 筆則跳過。
"""
import logging
from datetime import datetime
from apscheduler.schedulers.background import BackgroundScheduler
from croniter import croniter
from sqlalchemy import update
from sqlalchemy.orm import Session
from app.core.database import SessionLocal
from app.models.schedule import Schedule, ScheduleLog
logger = logging.getLogger(__name__)
_scheduler = BackgroundScheduler(timezone="Asia/Taipei")
def _next_run_from_cron(cron_timer, base):
    """Return the next fire time of a cron expression after *base*.

    Six-field expressions in this project are written seconds-first
    (sec min hour day month weekday), while croniter reads a sixth field as
    seconds in the LAST position, so the seconds field is moved to the end.
    Five-field expressions are passed through unchanged.
    """
    fields = cron_timer.split()
    if len(fields) == 6:
        fields = fields[1:] + fields[:1]
    return croniter(" ".join(fields), base).get_next(datetime)


def _watchdog_tick():
    """Run every schedule that is due, one after another.

    A schedule is due when its status is "Waiting" and ``next_run_at`` is
    not in the future.  Each due schedule is claimed with a conditional
    UPDATE to "Going"; if that UPDATE affects no row, another worker already
    claimed it and it is skipped.  After the run the log row is closed and
    the schedule is reset to "Waiting" with a recalculated ``next_run_at``.
    """
    db: Session = SessionLocal()
    try:
        due = (
            db.query(Schedule)
            .filter(
                Schedule.status == "Waiting",
                Schedule.next_run_at <= datetime.utcnow(),
            )
            .all()
        )
        for schedule in due:
            # Read the plain values once so they stay usable after a
            # commit or rollback expires the ORM object.
            schedule_id = schedule.id
            schedule_name = schedule.name
            cron_timer = schedule.cron_timer

            # Atomic lock: only one process wins
            affected = db.execute(
                update(Schedule)
                .where(Schedule.id == schedule_id, Schedule.status == "Waiting")
                .values(status="Going")
            ).rowcount
            db.commit()
            if affected == 0:
                # Another process already grabbed it
                continue

            log = ScheduleLog(
                schedule_id=schedule_id,
                schedule_name=schedule_name,
                started_at=datetime.utcnow(),
                status="running",
            )
            db.add(log)
            db.commit()
            db.refresh(log)

            try:
                from app.services.scheduler.runner import dispatch_schedule
                dispatch_schedule(schedule_id, log.id, db)
                final_status = "ok"
            except Exception as e:
                logger.exception(f"Schedule {schedule_name} failed: {e}")
                # The failed run may have left the session unusable; without
                # a rollback the commit below would raise and the schedule
                # would stay "Going" forever.
                db.rollback()
                final_status = "error"

            # Update log
            log.ended_at = datetime.utcnow()
            log.status = final_status

            # Recalculate next_run_at
            # NOTE(review): the base time is naive UTC while the scheduler
            # runs in Asia/Taipei — confirm which zone cron_timer is meant in.
            try:
                next_run = _next_run_from_cron(cron_timer, datetime.utcnow())
            except Exception as e:
                logger.error(f"Invalid cron_timer {cron_timer!r} for schedule {schedule_name}: {e}")
                next_run = None

            # Reset schedule
            db.execute(
                update(Schedule)
                .where(Schedule.id == schedule_id)
                .values(
                    status="Waiting",
                    last_run_at=datetime.utcnow(),
                    next_run_at=next_run,
                    last_status=final_status,
                )
            )
            db.commit()
    except Exception as e:
        logger.exception(f"Watchdog tick error: {e}")
        db.rollback()
    finally:
        db.close()
def start_watchdog():
    """Register the 3-minute watchdog job and start the scheduler.

    Calling this while the scheduler is already running is a no-op, so a
    repeated application start-up does not raise.
    """
    if _scheduler.running:
        logger.info("Watchdog scheduler already running")
        return
    _scheduler.add_job(
        _watchdog_tick,
        trigger="interval",
        minutes=3,
        id="watchdog",
        replace_existing=True,
    )
    _scheduler.start()
    logger.info("Watchdog scheduler started")
def stop_watchdog():
    """Stop the scheduler without waiting for a running job to finish.

    Calling this when the scheduler was never started is a no-op.
    """
    if not _scheduler.running:
        return
    _scheduler.shutdown(wait=False)
    logger.info("Watchdog scheduler stopped")

View File

@@ -0,0 +1,49 @@
"""Initial data seed: schedules + servers"""
from datetime import datetime
from croniter import croniter
from sqlalchemy.orm import Session
from app.models.schedule import Schedule
from app.models.server import Server
# Schedules inserted by seed_initial_data when no row with the same id exists.
# cron_timer is a six-field expression; the comment in _calc_next_run gives
# the field order as: sec min hour day month weekday.
INITIAL_SCHEDULES = [
{"id": 1, "name": "租戶檢查", "cron_timer": "0 */3 * * * *"},
{"id": 2, "name": "帳號檢查", "cron_timer": "0 */3 * * * *"},
{"id": 3, "name": "系統狀態", "cron_timer": "0 0 8 * * *"},
]
# Servers inserted by seed_initial_data when no row with the same id exists.
# Each dict is passed as keyword arguments to the Server model.
INITIAL_SERVERS = [
{"id": 1, "name": "home", "ip_address": "10.1.0.254", "sort_order": 1,
"description": "核心服務主機 (Ubuntu 24.04 / Dell Inspiron 3910)"},
{"id": 2, "name": "小的NAS", "ip_address": "10.1.0.20", "sort_order": 2,
"description": "資料庫主機 (Synology DS716+II / DSM 6.2.4)"},
{"id": 3, "name": "大的NAS", "ip_address": "10.1.0.30", "sort_order": 3,
"description": "儲存主機 (Synology DS920+ / DSM 7.3.2)"},
{"id": 4, "name": "Porsche_KLI", "ip_address": "10.1.0.245", "sort_order": 4,
"description": "開發環境 (ASUS MINIPC PN62 / Windows 11)"},
]
def _calc_next_run(cron_timer: str) -> datetime:
    """Return the next fire time of *cron_timer* after the current UTC time.

    Six-field expressions are written seconds-first
    (sec min hour day month weekday).  croniter reads a sixth field as
    seconds in the LAST position, so the seconds field is moved to the end
    before parsing; otherwise "0 0 8 * * *" would mean "the 8th of every
    month" instead of "daily at 08:00".  Five-field expressions are passed
    through unchanged.
    """
    fields = cron_timer.split()
    if len(fields) == 6:
        fields = fields[1:] + fields[:1]
    cron = croniter(" ".join(fields), datetime.utcnow())
    return cron.get_next(datetime)
def seed_initial_data(db: Session) -> None:
    """Add the initial schedules and servers that are not in the database yet.

    Rows are matched by primary key: an id that already exists is left
    untouched.  New schedules start as "Waiting" with ``next_run_at``
    computed from their cron expression.  Everything is committed once at
    the end.
    """
    for spec in INITIAL_SCHEDULES:
        if db.get(Schedule, spec["id"]):
            continue
        timer = spec["cron_timer"]
        schedule = Schedule(
            id=spec["id"],
            name=spec["name"],
            cron_timer=timer,
            status="Waiting",
            next_run_at=_calc_next_run(timer),
        )
        db.add(schedule)

    for spec in INITIAL_SERVERS:
        if db.get(Server, spec["id"]):
            continue
        db.add(Server(**spec))

    db.commit()

View File

@@ -0,0 +1,105 @@
"""
SystemChecker — 功能驗證(不只 handshake)
traefik: routers > 0 / keycloak: token 取得 / mail: EHLO / db: SELECT 1 / server: ping
"""
import logging
import smtplib
from typing import Optional
import httpx
import psycopg2
from app.core.config import settings
logger = logging.getLogger(__name__)
class SystemChecker:
    """Functional health probes for infrastructure services and servers.

    Every probe goes one step beyond "the port is open": Traefik must report
    routers, Keycloak must issue a token, SMTP must answer EHLO and
    PostgreSQL must answer ``SELECT 1``.  The service probes never raise;
    failures are logged and reported as False.
    """

    def check_traefik(self, host: str = "localhost", port: int = 8080) -> bool:
        """Return True when ``/api/overview`` reports more than 0 HTTP routers."""
        try:
            resp = httpx.get(f"http://{host}:{port}/api/overview", timeout=5.0)
            if resp.status_code != 200:
                return False
            data = resp.json()
            total_routers = data.get("http", {}).get("routers", {}).get("total", 0)
            return total_routers > 0
        except Exception as e:
            logger.warning(f"Traefik check failed: {e}")
            return False

    def check_keycloak(self, base_url: str, realm: str = "master") -> bool:
        """Return True when Keycloak is reachable and issues an admin token.

        Step 1: ``GET /realms/{realm}`` must answer 200.
        Step 2: a client-credentials token request against
        ``settings.KEYCLOAK_ADMIN_REALM`` must answer 200 with an
        ``access_token``.
        """
        try:
            resp = httpx.get(f"{base_url}/realms/{realm}", timeout=8.0)
            if resp.status_code != 200:
                return False
            # Functional check: get admin token
            token_resp = httpx.post(
                f"{base_url}/realms/{settings.KEYCLOAK_ADMIN_REALM}/protocol/openid-connect/token",
                data={
                    "grant_type": "client_credentials",
                    "client_id": settings.KEYCLOAK_ADMIN_CLIENT_ID,
                    "client_secret": settings.KEYCLOAK_ADMIN_CLIENT_SECRET,
                },
                timeout=8.0,
            )
            return token_resp.status_code == 200 and "access_token" in token_resp.json()
        except Exception as e:
            logger.warning(f"Keycloak check failed ({base_url}): {e}")
            return False

    def check_smtp(self, host: str, port: int = 587) -> bool:
        """Return True when an SMTP connection and EHLO complete without error."""
        try:
            with smtplib.SMTP(host, port, timeout=8) as smtp:
                smtp.ehlo()
            return True
        except Exception as e:
            logger.warning(f"SMTP check failed ({host}:{port}): {e}")
            return False

    def check_postgres(self, host: str, port: int = 5432) -> bool:
        """Return True when PostgreSQL at host:port answers ``SELECT 1``."""
        try:
            # SECURITY(review): the credentials below are hard-coded in
            # source.  They should be moved to configuration and rotated.
            conn = psycopg2.connect(
                host=host, port=port, dbname="postgres",
                user="admin", password="DC1qaz2wsx",
                connect_timeout=8,
            )
            try:
                cur = conn.cursor()
                cur.execute("SELECT 1")
                result = cur.fetchone()
            finally:
                # Release the connection even when the query raised.
                conn.close()
            return result == (1,)
        except Exception as e:
            logger.warning(f"PostgreSQL check failed ({host}:{port}): {e}")
            return False

    def ping_server(self, ip_address: str) -> Optional[float]:
        """Return the round-trip time to *ip_address* in ms, or None.

        An ICMP ping (ping3) is tried first.  When that raises
        PermissionError, the time of a TCP connect to port 22 is measured
        instead.  None means the host did not answer or the probe failed.
        """
        try:
            import ping3
            result = ping3.ping(ip_address, timeout=3)
            if result is not None and result is not False:
                return round(result * 1000, 2)  # convert to ms
        except PermissionError:
            # Fallback: TCP connect to port 22
            import socket
            import time
            try:
                start = time.time()
                with socket.create_connection((ip_address, 22), timeout=3):
                    pass
                return round((time.time() - start) * 1000, 2)
            except Exception:
                # Unreachable over TCP as well: report as "no response".
                pass
        except Exception as e:
            logger.warning(f"Ping failed for {ip_address}: {e}")
        return None