1133 lines
45 KiB
Python
1133 lines
45 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
增强版任务调度器模块
|
||
负责任务的高级调度和执行
|
||
实现了优先级队列、工作线程管理、任务持久化等特性
|
||
"""
|
||
|
||
import asyncio
|
||
import logging
|
||
import json
|
||
import os
|
||
import uuid
|
||
import time
|
||
import traceback
|
||
from typing import Dict, List, Any, Optional, Set, Tuple, Union
|
||
from datetime import datetime, timedelta
|
||
from sqlalchemy import select, update, func
|
||
from sqlalchemy.ext.asyncio import AsyncSession
|
||
|
||
from config.settings import settings
|
||
from data.models.taskdef import VWEDTaskDef
|
||
from data.models.taskrecord import VWEDTaskRecord
|
||
from data.session import get_async_session
|
||
from services.execution.task_executor import TaskExecutor
|
||
from services.execution.handlers.model.block_name import RobotBlockName
|
||
|
||
from services.enhanced_scheduler.priority_queue_manager import PriorityQueueManager
|
||
from services.enhanced_scheduler.worker_manager import WorkerManager
|
||
from services.enhanced_scheduler.task_persistence import TaskPersistenceManager
|
||
from services.enhanced_scheduler.periodic_task_manager import PeriodicTaskManager
|
||
from data.enum.task_record_enum import TaskStatus, SourceType
|
||
from data.enum.task_def_enum import EnableStatus, PeriodicTaskStatus, TaskStatusEnum
|
||
from utils.logger import get_logger
|
||
|
||
# 获取日志记录器
|
||
logger = get_logger("services.enhanced_scheduler.task_scheduler")
|
||
|
||
class EnhancedTaskScheduler:
|
||
"""
|
||
增强版任务调度器类
|
||
实现了高级任务调度功能
|
||
"""
|
||
|
||
_instance = None
|
||
|
||
def __new__(cls):
|
||
"""
|
||
单例模式实现
|
||
"""
|
||
if cls._instance is None:
|
||
cls._instance = super(EnhancedTaskScheduler, cls).__new__(cls)
|
||
cls._instance._initialized = False
|
||
return cls._instance
|
||
|
||
def __init__(self):
|
||
"""
|
||
初始化调度器
|
||
"""
|
||
if not self._initialized:
|
||
# 基本属性
|
||
self.running_tasks = {} # 正在运行的任务 {task_id: executor}
|
||
self.is_running = False # 调度器是否正在运行
|
||
|
||
# 配置信息
|
||
self._load_config()
|
||
|
||
# 初始化组件
|
||
self._init_components()
|
||
|
||
self._initialized = True
|
||
logger.info("增强版任务调度器初始化完成")
|
||
|
||
def _load_config(self):
    """Copy the scheduler options from ``settings`` onto this instance."""
    # (instance attribute, settings key) pairs, applied in this order.
    option_names = (
        # Worker pool bounds and number of queues.
        ("worker_min_count", "TASK_SCHEDULER_MIN_WORKER_COUNT"),
        ("worker_max_count", "TASK_SCHEDULER_MAX_WORKER_COUNT"),
        ("queue_count", "TASK_SCHEDULER_QUEUE_COUNT"),
        # Queue threshold percentiles.
        ("queue_threshold_percentiles", "TASK_SCHEDULER_QUEUE_THRESHOLD_PERCENTILES"),
        # Worker allocation ratios.
        ("worker_ratios", "TASK_SCHEDULER_WORKER_RATIOS"),
        # Task timeout (logged below in seconds).
        ("task_timeout", "TASK_SCHEDULER_TASK_TIMEOUT"),
        # Retry policy.
        ("max_retry_count", "TASK_SCHEDULER_MAX_RETRY_COUNT"),
        ("retry_delay", "TASK_SCHEDULER_RETRY_DELAY"),
        # Backup settings.
        ("backup_interval", "TASK_SCHEDULER_BACKUP_INTERVAL"),
        ("backup_dir", "TASK_SCHEDULER_BACKUP_DIR"),
        ("max_backups", "TASK_SCHEDULER_MAX_BACKUPS"),
        # Auto-scaling settings.
        ("cpu_threshold", "TASK_SCHEDULER_CPU_THRESHOLD"),
        ("memory_threshold", "TASK_SCHEDULER_MEMORY_THRESHOLD"),
        ("auto_scale_interval", "TASK_SCHEDULER_AUTO_SCALE_INTERVAL"),
        ("worker_heartbeat_interval", "TASK_SCHEDULER_WORKER_HEARTBEAT_INTERVAL"),
    )
    for attribute, setting_key in option_names:
        setattr(self, attribute, getattr(settings, setting_key))

    logger.info(f"加载任务调度器配置: 工作线程数={self.worker_min_count}-{self.worker_max_count}, "
                f"队列数={self.queue_count}, 任务超时={self.task_timeout}秒")
|
||
|
||
def _init_components(self):
    """Build the collaborating managers from the loaded configuration."""
    # Priority queues.
    queues = PriorityQueueManager(
        queue_count=self.queue_count,
        threshold_percentiles=self.queue_threshold_percentiles,
        worker_ratios=self.worker_ratios,
    )
    self.queue_manager = queues

    # Worker pool; it is handed the queue manager built above.
    workers = WorkerManager(
        min_workers=self.worker_min_count,
        max_workers=self.worker_max_count,
        cpu_threshold=self.cpu_threshold,
        memory_threshold=self.memory_threshold,
        auto_scale_interval=self.auto_scale_interval,
        worker_heartbeat_interval=self.worker_heartbeat_interval,
        queue_manager=queues,
    )
    self.worker_manager = workers

    # The worker manager runs ``_worker`` for each worker and asks
    # ``_get_queue_size`` for the current backlog.
    workers.set_worker_factory(self._worker)
    workers.set_queue_size_getter(self._get_queue_size)

    # Task persistence (backups).
    self.persistence_manager = TaskPersistenceManager(
        backup_interval=self.backup_interval,
        backup_dir=self.backup_dir,
        max_backups=self.max_backups,
    )

    # Periodic tasks are triggered through ``_run_periodic_task``.
    periodic = PeriodicTaskManager()
    self.periodic_task_manager = periodic
    periodic.set_run_task_callback(self._run_periodic_task)

    # Handle of the background monitor task; created in start().
    self.monitor_task = None
|
||
|
||
async def start(self, worker_count: int = None) -> None:
    """Start the scheduler and all of its components.

    Args:
        worker_count: Initial number of workers; when ``None`` the
            configured minimum worker count is used.
    """
    if self.is_running:
        logger.warning("增强版任务调度器已经在运行中")
        return

    self.is_running = True

    # The priority queue manager needs no explicit start.

    # Worker pool.
    initial_workers = self.worker_min_count if worker_count is None else worker_count
    await self.worker_manager.start(initial_workers)

    # Persistence, then periodic tasks.
    await self.persistence_manager.start()
    await self.periodic_task_manager.start()

    # Background monitor loop.
    self.monitor_task = asyncio.create_task(self._monitor())

    # Re-queue tasks that were left unfinished.
    await self._restore_pending_tasks()

    logger.info(f"增强版任务调度器启动成功，工作线程数: {initial_workers}")
|
||
|
||
async def stop(self) -> None:
    """Stop the scheduler, its components and every running task."""
    if not self.is_running:
        logger.warning("增强版任务调度器未在运行")
        return

    self.is_running = False

    # Cancel the monitor loop and wait until it has actually finished.
    monitor = self.monitor_task
    if monitor:
        monitor.cancel()
        try:
            await monitor
        except asyncio.CancelledError:
            pass
        self.monitor_task = None

    # Shut the managers down: periodic tasks, persistence, then workers.
    for manager in (self.periodic_task_manager, self.persistence_manager, self.worker_manager):
        await manager.stop()

    # Cancel whatever is still executing; one failure must not block the rest.
    for running_id, running_executor in list(self.running_tasks.items()):
        try:
            await running_executor.cancel()
        except Exception as exc:
            logger.error(f"取消任务 {running_id} 异常: {str(exc)}")

    self.running_tasks.clear()

    logger.info("增强版任务调度器已停止")
|
||
|
||
async def _restore_pending_tasks(self) -> None:
    """Re-queue persisted tasks that still need to run.

    Only tasks whose database status is RUNNING or QUEUED are restored;
    every other persisted entry (missing record or any other status) is
    removed from the persistence manager. Errors are logged, not raised.
    """
    try:
        pending_tasks = await self.persistence_manager.load_pending_tasks()
        if not pending_tasks:
            logger.info("无待恢复的任务")
            return

        restorable = {}
        stale_ids = []
        # RUNNING may be left over from an abnormal exit; QUEUED never started.
        resumable_states = (TaskStatus.RUNNING, TaskStatus.QUEUED)

        async with get_async_session() as session:
            for task_id, task_data in pending_tasks.items():
                # Tasks this process is already executing are left alone.
                if task_id in self.running_tasks:
                    continue

                # Look up the current status in the database.
                lookup = await session.execute(
                    select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_id)
                )
                task_record = lookup.scalars().first()

                if not task_record:
                    # No database record any more: drop the persisted entry.
                    stale_ids.append(task_id)
                    continue

                current_status = task_record.status
                if current_status in resumable_states:
                    restorable[task_id] = task_data
                    continue

                # Any other status needs no restore.
                stale_ids.append(task_id)
                logger.debug(f"任务 {task_id} 状态为 {current_status}，不需要恢复")

        # Drop the entries that do not need restoring.
        for stale_id in stale_ids:
            await self.persistence_manager.remove_task(stale_id)

        # Put the restorable tasks back on the queue.
        for task_id, task_data in restorable.items():
            priority = task_data.get("priority", PeriodicTaskStatus.PERIODIC)

            await self.queue_manager.enqueue(task_id, priority)
            await self.persistence_manager.add_task(task_id, priority, task_data.get("info", {}))

            logger.info(f"恢复任务 {task_id}, 优先级: {priority}")

        logger.info(f"已恢复 {len(restorable)} 个未完成的任务，移除 {len(stale_ids)} 个不需要恢复的任务")

    except Exception as exc:
        logger.error(f"恢复未完成任务异常: {str(exc)}")
        logger.error(traceback.format_exc())
|
||
|
||
async def submit_task(self, task_record_id: str) -> Dict[str, Any]:
    """Queue an existing task record for execution.

    Args:
        task_record_id: ID of the task record to queue.

    Returns:
        Dict[str, Any]: ``success`` plus a ``message``. On success also the
        queue index, total queue size and priority, and a ``warning`` when
        every worker is currently busy.
    """
    # A task that already has an executor must not be queued again.
    # NOTE(review): tasks that are already waiting in the queue are not
    # rejected here — confirm whether duplicate queueing is possible.
    if task_record_id in self.running_tasks:
        return {
            "success": False,
            "message": "任务已在执行中",
            "taskRecordId": task_record_id
        }

    async with get_async_session() as session:
        result = await session.execute(
            select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
        )
        task_record = result.scalars().first()

        if not task_record:
            return {
                "success": False,
                "message": "任务记录不存在",
                "taskRecordId": task_record_id
            }

        # Completed, failed or cancelled tasks cannot be queued again.
        if task_record.status in (TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELED):
            return {
                "success": False,
                "message": f"任务已结束，状态码: {task_record.status}",
                "taskRecordId": task_record_id,
                "status": task_record.status
            }

        # A missing or zero priority falls back to the default.
        priority = task_record.priority or PeriodicTaskStatus.PERIODIC

        task_record.status = TaskStatus.QUEUED

        # Unparseable input parameters are treated as "no parameters".
        try:
            input_params = json.loads(task_record.input_params) if task_record.input_params else {}
        except (TypeError, ValueError):
            input_params = {}

        # Snapshot every attribute BEFORE committing: with the default
        # expire_on_commit behaviour a commit expires the instance, and
        # touching an expired attribute on an AsyncSession triggers implicit
        # IO, which fails outside a greenlet context.
        task_info = {
            "id": task_record.id,
            "def_id": task_record.def_id,
            "def_label": task_record.def_label,
            "def_version": task_record.def_version,
            "status": task_record.status,
            "created_at": task_record.created_at,
            "priority": priority,
            "periodic_task": task_record.periodic_task,
            "input_params": input_params
        }

        await session.commit()

    # Persist first, then enqueue.
    await self.persistence_manager.add_task(task_record_id, priority, task_info)
    queue_index = await self.queue_manager.enqueue(task_record_id, priority)

    response = {
        "success": True,
        "message": "任务已提交到队列",
        "taskRecordId": task_record_id,
        "queueIndex": queue_index,
        "queueSize": self._get_queue_size(),
        "priority": priority
    }

    # Warn the caller when no worker is idle.
    worker_status = self.worker_manager.worker_status
    busy_workers = sum(
        1 for status in worker_status.values() if status.get("current_task") is not None
    )
    if busy_workers == len(worker_status):
        response["warning"] = f"警告：所有工作线程({len(worker_status)}个)都在忙碌中，任务可能需要等待较长时间"
        logger.warning(f"提交任务 {task_record_id} 时所有工作线程都在忙碌中，当前工作线程数: {len(worker_status)}")
    return response
|
||
|
||
async def run_task(self, task_def_id: str, params: List[Dict[str, Any]] = None, parent_task_id: str = None,
                   root_task_id: str = None, source_type: int = None, source_system: str = None,
                   source_device: str = None, source_time: datetime = None, source_ip: str = None,
                   source_client_info: str = None, tf_api_token: str = None, map_id: str = None) -> Dict[str, Any]:
    """Create a task record from a task definition and queue it.

    Args:
        task_def_id: ID of the task definition to run.
        params: Task input parameters; stored as JSON when non-empty.
        parent_task_id: Parent task record ID (task chains).
        root_task_id: Root task record ID (task tree tracking).
        source_type: Origin type of the task.
        source_system: Identifier of the originating system.
        source_device: Identifier of the device that issued the task.
        source_time: Time the task was issued.
        source_ip: IP address that issued the task.
        source_client_info: Client device information.
        tf_api_token: API token of the master task system.
        map_id: ID of the related map.

    Returns:
        Dict[str, Any]: ``success``, ``message`` and, once the record has
        been created, its ID, status, priority, creation time and the raw
        result of the queue submission under ``queueResult``.
    """
    try:
        async with get_async_session() as session:
            result = await session.execute(
                select(VWEDTaskDef).where(VWEDTaskDef.id == task_def_id)
            )
            task_def = result.scalars().first()
            if not task_def:
                return {
                    "success": False,
                    "message": "任务定义不存在",
                    "taskDefId": task_def_id
                }

            # NOTE(review): the caller-supplied ``map_id`` and ``tf_api_token``
            # are not used; the values stored on the task definition always
            # win. Kept as-is — confirm whether that is intended.
            map_id = task_def.map_id
            user_token = task_def.user_token
            task_def_detail = task_def.detail

            task_record_id = str(uuid.uuid4())
            task_record = VWEDTaskRecord(
                id=task_record_id,
                def_id=task_def.id,
                def_label=task_def.label,
                def_version=task_def.version,
                parent_task_record_id=parent_task_id,
                root_task_record_id=root_task_id,
                status=TaskStatus.QUEUED,
                input_params=json.dumps(params, ensure_ascii=False) if params else None,
                periodic_task=task_def.periodic_task or PeriodicTaskStatus.NON_PERIODIC,
                task_def_detail=task_def_detail,
                source_type=source_type,
                source_system=source_system,
                source_device=source_device,
                source_time=source_time,
                source_ip=source_ip,
                source_client_info=source_client_info
            )

            session.add(task_record)
            await session.flush()

            # Snapshot what is needed later while the instance is still
            # loaded; after the commit its attributes may be expired.
            priority = task_record.priority or PeriodicTaskStatus.PERIODIC
            sync_kwargs = {
                "task_record_id": task_record_id,
                "task_name": task_record.def_label,
                "is_periodic": task_record.periodic_task,
                "priority": task_record.priority,
                "parent_id": task_record.parent_task_record_id or "",
                "token": user_token,
                "map_id": map_id,
            }

            # submit_task() looks the record up through a different session,
            # so the insert has to be committed (not merely flushed) first.
            await session.commit()

        # Queue the freshly created record.
        result = await self.submit_task(task_record_id)
        queued = bool(result and result.get("success"))
        if not queued:
            logger.warning(f"任务 {task_record_id} 提交到队列失败: {result.get('message') if result else None}")

        select_agv_type = self.check_task_def_select_agv(task_def_detail)

        # Mirror the task to the master task system. Best effort: a failure
        # here is logged and must not affect the main flow.
        try:
            # Imported lazily, as in the original code.
            from services.sync_service import create_task as tf_create_task
            sync_response = await tf_create_task(is_agv=select_agv_type, **sync_kwargs)
            if sync_response and sync_response.get("success"):
                logger.info(f"成功同步任务到主任务系统: {task_record_id}")
            else:
                logger.warning(f"同步任务到主任务系统失败: {task_record_id}")
        except Exception as e:
            logger.error(f"同步任务到主任务系统时发生错误: {str(e)}")

        # Report the real outcome of the queue submission instead of an
        # unconditional success.
        if queued:
            message = "任务已创建并提交到队列"
        else:
            message = f"任务已创建，但提交到队列失败: {result.get('message') if result else None}"

        return {
            "success": queued,
            "message": message,
            "taskRecordId": task_record_id,
            "status": TaskStatus.QUEUED,
            "priority": priority,
            "createTime": datetime.now(),
            "queueResult": result
        }

    except Exception as e:
        logger.error(f"运行任务失败: {str(e)}")
        logger.error(traceback.format_exc())
        return {
            "success": False,
            "message": f"运行任务失败: {str(e)}",
            "taskDefId": task_def_id
        }
|
||
|
||
async def cancel_task(self, task_record_id: str) -> Dict[str, Any]:
    """Cancel a task together with its running child tasks.

    Child tasks in RUNNING state are cancelled first (recursively). A task
    that is executing in this process is cancelled through its executor;
    otherwise its record is marked CANCELED directly and the task
    definition status is reset to PENDING.

    Args:
        task_record_id: ID of the task record to cancel.

    Returns:
        Dict[str, Any]: ``success`` and ``message``; ``childTasksResults``
        is added when child tasks were cancelled.
    """
    logger.info(f"准备取消任务: {task_record_id}")

    async with get_async_session() as session:
        result = await session.execute(
            select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
        )
        task_record = result.scalars().first()
        if not task_record:
            return {
                "success": False,
                "message": "任务记录不存在",
                "taskRecordId": task_record_id
            }

        # A task that has already ended cannot be cancelled.
        if task_record.status in (TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELED):
            return {
                "success": False,
                "message": f"任务已结束，状态码: {task_record.status}",
                "taskRecordId": task_record_id,
                "status": task_record.status
            }

        # Remember the definition ID while the record is still loaded.
        def_id = task_record.def_id

        # Only the IDs of the running children are needed.
        child_tasks_query = await session.execute(
            select(VWEDTaskRecord.id).where(
                VWEDTaskRecord.parent_task_record_id == task_record_id,
                VWEDTaskRecord.status == TaskStatus.RUNNING
            )
        )
        child_task_ids = child_tasks_query.scalars().all()

    # Cancel the children first; the recursion also reaches grandchildren.
    child_tasks_results = []
    for child_id in child_task_ids:
        logger.info(f"取消子任务: {child_id}, 父任务: {task_record_id}")
        child_cancel_result = await self.cancel_task(child_id)
        child_tasks_results.append({
            "taskRecordId": child_id,
            "success": child_cancel_result.get("success", False),
            "message": child_cancel_result.get("message", "未知结果")
        })

    # Executing in this process: let the executor perform the cancellation.
    if task_record_id in self.running_tasks:
        executor = self.running_tasks[task_record_id]
        result = await executor.cancel()

        await self.persistence_manager.remove_task(task_record_id)

        if child_tasks_results:
            result["childTasksResults"] = child_tasks_results
        return result

    # Not executing: mark the record as cancelled directly. The record is
    # re-loaded in this session because the instance read above belongs to
    # the earlier, already closed session; changing that instance and
    # committing a different session would not persist anything.
    async with get_async_session() as session:
        reloaded = await session.execute(
            select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
        )
        task_record = reloaded.scalars().first()
        if not task_record:
            missing = {
                "success": False,
                "message": "任务记录不存在",
                "taskRecordId": task_record_id
            }
            if child_tasks_results:
                missing["childTasksResults"] = child_tasks_results
            return missing

        task_record.status = TaskStatus.CANCELED
        task_record.ended_reason = "任务被取消"
        task_record.ended_on = datetime.now()
        # Allow a task to be started again from the same location.
        task_record.allow_restart_same_location = True

        await session.commit()

    await self.persistence_manager.remove_task(task_record_id)

    # Reset the task definition status in its own session to avoid
    # transaction conflicts.
    async with get_async_session() as new_session:
        await new_session.execute(
            update(VWEDTaskDef)
            .where(VWEDTaskDef.id == def_id)
            .values(status=TaskStatusEnum.PENDING)
        )
        await new_session.commit()
        logger.info(f"更新任务定义状态为普通状态: {def_id}")

    result = {
        "success": True,
        "message": "任务已取消",
        "taskRecordId": task_record_id
    }
    if child_tasks_results:
        result["childTasksResults"] = child_tasks_results

    return result
|
||
|
||
async def get_task_status(self, task_record_id: str) -> Dict[str, Any]:
    """Return the stored state of a task plus its local queue/run flags.

    Args:
        task_record_id: ID of the task record.

    Returns:
        Dict[str, Any]: ``{"success": True, "data": {...}}`` when the record
        exists, otherwise ``success`` False with a ``message``.
    """
    try:
        async with get_async_session() as session:
            lookup = await session.execute(
                select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
            )
            record = lookup.scalars().first()

            if not record:
                return {
                    "success": False,
                    "message": "任务记录不存在",
                    "taskRecordId": task_record_id
                }

            # Is the task still waiting in one of the queues?
            queued = await self._task_in_queue(task_record_id)

            # Flatten the record into the response payload.
            payload = {}
            payload["taskRecordId"] = record.id
            payload["taskDefId"] = record.def_id
            payload["defLabel"] = record.def_label
            payload["defVersion"] = record.def_version
            payload["status"] = record.status
            payload["createdOn"] = record.created_at
            payload["endedOn"] = record.ended_on
            payload["executorTime"] = record.executor_time
            payload["firstExecutorTime"] = record.first_executor_time
            payload["endedReason"] = record.ended_reason
            payload["priority"] = record.priority
            payload["periodicTask"] = record.periodic_task
            payload["runningInCurrentQueue"] = task_record_id in self.running_tasks
            payload["inQueue"] = queued
            payload["retryCount"] = getattr(record, "retry_count", 0)

            return {"success": True, "data": payload}

    except Exception as exc:
        logger.error(f"获取任务状态失败: {str(exc)}")
        return {
            "success": False,
            "message": f"获取任务状态失败: {str(exc)}",
            "taskRecordId": task_record_id
        }
|
||
|
||
async def update_periodic_task(self, task_def_id: str, enable: bool = None) -> Dict[str, Any]:
    """Forward a periodic-task update to the periodic task manager.

    Args:
        task_def_id: ID of the task definition.
        enable: Whether the periodic task should be enabled.

    Returns:
        Dict[str, Any]: Whatever the periodic task manager returns.
    """
    manager = self.periodic_task_manager
    outcome = await manager.update_task(task_def_id, enable)
    return outcome
|
||
|
||
async def _run_periodic_task(self, task_def_id: str) -> Dict[str, Any]:
    """Run a task definition on behalf of the periodic task manager.

    The task is created through ``run_task`` with origin information that
    marks it as issued by the scheduler itself.

    Args:
        task_def_id: ID of the task definition.

    Returns:
        Dict[str, Any]: The result of ``run_task``.
    """
    triggered_at = datetime.now()
    origin = {
        "source_type": SourceType.SYSTEM_SCHEDULING,  # system scheduling
        "source_system": "SCHEDULER",
        "source_device": "SYSTEM",
        "source_time": triggered_at,
        "source_ip": "127.0.0.1",  # the local scheduler
        "source_client_info": json.dumps({"system": "SCHEDULER", "type": "PERIODIC"}, ensure_ascii=False),
    }
    return await self.run_task(task_def_id=task_def_id, **origin)
|
||
|
||
async def _worker(self, worker_id: int) -> None:
    """Worker loop: take tasks from the queue and execute them.

    Runs until the scheduler is stopped or the worker is cancelled. For
    every task the cancel-checker, the ``running_tasks`` entry and the
    queue's ``task_done`` bookkeeping are released in a ``finally`` block,
    so they are cleaned up even when execution raises or the worker is
    cancelled in the middle of a task.

    Args:
        worker_id: ID of this worker.
    """
    logger.info(f"工作线程 {worker_id} 启动")

    while self.is_running:
        try:
            queue_index, item = await self.queue_manager.dequeue(
                worker_id, self.worker_manager.get_worker_count()
            )

            # Nothing to do: back off briefly and poll again.
            if queue_index == -1 or item is None:
                await asyncio.sleep(0.1)
                continue

            if isinstance(item, tuple) and len(item) == 2:
                priority, task_record_id = item
                # Priorities are stored negated in the queue; flip back.
                priority = -priority
            else:
                # Legacy format: the item is the bare task ID.
                task_record_id = item
                priority = PeriodicTaskStatus.PERIODIC

            self.worker_manager.update_worker_status(worker_id, {
                "current_task": task_record_id,
                "task_priority": priority,
                "task_start_time": datetime.now()
            })

            logger.info(f"工作线程 {worker_id} 获取到任务: {task_record_id}, 优先级: {priority}")

            cancel_checker_task = None
            try:
                executor = TaskExecutor(task_record_id)
                executor.set_timeout(self.task_timeout)

                self.running_tasks[task_record_id] = executor

                # A task that is executing is no longer kept in persistence.
                await self.persistence_manager.remove_task(task_record_id)

                # Watches the database for a cancel mark while the task runs.
                cancel_checker_task = asyncio.create_task(
                    self._check_task_cancel(task_record_id, executor)
                )

                result = await executor.execute()

                self.worker_manager.update_worker_status(worker_id, {
                    "current_task": None,
                    "last_task": task_record_id,
                    "last_task_result": result.get("success", False),
                    "task_count": self.worker_manager.worker_status.get(worker_id, {}).get("task_count", 0) + 1
                })

                logger.info(f"工作线程 {worker_id} 完成任务: {task_record_id}, 结果: {result.get('success')}")

            except Exception as e:
                logger.error(f"工作线程 {worker_id} 执行任务异常: {str(e)}")
                logger.error(traceback.format_exc())

                # Remove the task before scheduling a retry so the retry is
                # not rejected as "already running".
                self.running_tasks.pop(task_record_id, None)

                # Decides between a delayed retry and a final failure.
                await self._handle_task_error(task_record_id, str(e))

                self.worker_manager.update_worker_status(worker_id, {
                    "current_task": None,
                    "last_task": task_record_id,
                    "last_task_result": False,
                    "last_error": str(e),
                    "task_count": self.worker_manager.worker_status.get(worker_id, {}).get("task_count", 0) + 1
                })

            finally:
                self.running_tasks.pop(task_record_id, None)

                # Always stop the checker; it would otherwise keep polling
                # the database indefinitely.
                if cancel_checker_task is not None:
                    cancel_checker_task.cancel()
                    try:
                        await cancel_checker_task
                    except asyncio.CancelledError:
                        pass

                self.queue_manager.task_done(queue_index)

            self.worker_manager.update_worker_heartbeat(worker_id)

        except asyncio.TimeoutError:
            # Timed out: move on to the next iteration.
            continue
        except asyncio.CancelledError:
            logger.info(f"工作线程 {worker_id} 被取消")
            break
        except Exception as e:
            logger.error(f"工作线程 {worker_id} 异常: {str(e)}")
            logger.error(traceback.format_exc())

            self.worker_manager.update_worker_status(worker_id, {
                "error": str(e),
                "error_time": datetime.now()
            })

            # Back off briefly so a persistent error does not spin the loop.
            await asyncio.sleep(1.0)

    logger.info(f"工作线程 {worker_id} 结束")
|
||
|
||
async def _handle_task_error(self, task_record_id: str, error_message: str) -> None:
    """Handle a failed execution: schedule a retry or mark the task failed.

    While the retry budget is not exhausted the record goes back to QUEUED
    and a delayed resubmission is scheduled, with the delay doubling on
    every attempt. Otherwise the record is marked FAILED and the task
    definition status is reset to PENDING. Errors raised here are logged,
    not propagated.

    Args:
        task_record_id: ID of the task record.
        error_message: Error text of the failed execution.
    """
    try:
        async with get_async_session() as session:
            result = await session.execute(
                select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
            )
            task_record = result.scalars().first()

            if not task_record:
                logger.error(f"任务记录不存在: {task_record_id}")
                return

            retry_count = getattr(task_record, "retry_count", 0) or 0
            # Read before committing; a commit may expire loaded attributes.
            def_id = task_record.def_id

            if retry_count < self.max_retry_count:
                task_record.retry_count = retry_count + 1
                task_record.status = TaskStatus.QUEUED
                task_record.ended_reason = f"任务执行异常，准备第{retry_count + 1}次重试: {error_message}"

                await session.commit()

                # Exponential back-off.
                retry_delay = self.retry_delay * (2 ** retry_count)

                # Keep a strong reference: the event loop only holds weak
                # references to tasks, so an unreferenced task may be
                # garbage-collected before it has run.
                retry_task = asyncio.create_task(self._delayed_retry(task_record_id, retry_delay))
                pending_retries = self.__dict__.setdefault("_retry_tasks", set())
                pending_retries.add(retry_task)
                retry_task.add_done_callback(pending_retries.discard)

                logger.info(f"任务 {task_record_id} 将在 {retry_delay} 秒后进行第 {retry_count + 1} 次重试")

            else:
                # Retry budget exhausted: final failure.
                task_record.status = TaskStatus.FAILED
                task_record.ended_reason = f"任务执行异常，重试{retry_count}次后失败: {error_message}"
                task_record.ended_on = datetime.now()
                # Allow a task to be started again from the same location.
                task_record.allow_restart_same_location = True

                await session.commit()

                # Reset the task definition status in a separate session.
                async with get_async_session() as new_session:
                    await new_session.execute(
                        update(VWEDTaskDef)
                        .where(VWEDTaskDef.id == def_id)
                        .values(status=TaskStatusEnum.PENDING)
                    )
                    await new_session.commit()
                    logger.info(f"更新任务定义状态为普通状态: {def_id}")

                logger.info(f"任务 {task_record_id} 重试{retry_count}次后失败")

    except Exception as e:
        logger.error(f"处理任务错误异常: {str(e)}")
        logger.error(traceback.format_exc())
|
||
|
||
async def _delayed_retry(self, task_record_id: str, delay_seconds: int) -> None:
    """Resubmit a task after waiting ``delay_seconds``.

    The outcome of the resubmission is checked, so a rejected submission is
    logged as a warning instead of being reported as resubmitted.

    Args:
        task_record_id: ID of the task record.
        delay_seconds: Seconds to wait before resubmitting.
    """
    try:
        await asyncio.sleep(delay_seconds)

        result = await self.submit_task(task_record_id)

        if isinstance(result, dict) and result.get("success"):
            logger.info(f"任务 {task_record_id} 已重新提交")
        else:
            reason = result.get("message") if isinstance(result, dict) else result
            logger.warning(f"任务 {task_record_id} 重新提交失败: {reason}")

    except Exception as e:
        logger.error(f"延迟重试任务异常: {str(e)}")
|
||
|
||
async def _monitor(self) -> None:
    """Background loop that periodically cleans up zombie tasks.

    Runs while the scheduler is running; the pause between two checks comes
    from ``settings.TASK_SCHEDULER_ZOMBIE_TASK_CHECK_INTERVAL``.
    """
    logger.info("任务监控启动")

    while self.is_running:
        try:
            await self._check_zombie_tasks()
            # Wait for the configured interval before the next check.
            await asyncio.sleep(settings.TASK_SCHEDULER_ZOMBIE_TASK_CHECK_INTERVAL)
        except asyncio.CancelledError:
            # Cancelled: leave the loop.
            logger.info("任务监控被取消")
            break
        except Exception as exc:
            logger.error(f"任务监控异常: {str(exc)}")
            # Pause briefly so a persistent error does not spin the loop.
            await asyncio.sleep(5.0)

    logger.info("任务监控结束")
|
||
|
||
async def _check_zombie_tasks(self) -> None:
    """Fail tasks that are stuck in RUNNING without a local executor.

    A task is treated as a zombie when its status is RUNNING, its first
    execution time is more than one hour ago and it is not in
    ``running_tasks`` of this process. Only tasks that were actually
    updated are counted and committed. Errors are logged, not raised.
    """
    try:
        async with get_async_session() as session:
            one_hour_ago = datetime.now() - timedelta(hours=1)

            query = select(VWEDTaskRecord).where(
                VWEDTaskRecord.status == TaskStatus.RUNNING,
                VWEDTaskRecord.first_executor_time < one_hour_ago
            )

            result = await session.execute(query)
            candidates = result.scalars().all()

            cleaned = 0
            for task in candidates:
                # Still executing in this process: not a zombie.
                if task.id in self.running_tasks:
                    continue

                task.status = TaskStatus.FAILED
                task.ended_reason = "任务执行超时，被系统自动终止"
                task.ended_on = datetime.now()
                # Allow a task to be started again from the same location.
                task.allow_restart_same_location = True
                cleaned += 1

            if cleaned:
                await session.commit()
                logger.info(f"清理 {cleaned} 个僵尸任务")

    except Exception as e:
        logger.error(f"检查僵尸任务异常: {str(e)}")
|
||
|
||
async def _task_in_queue(self, task_record_id: str) -> bool:
|
||
"""
|
||
检查任务是否在队列中
|
||
|
||
Args:
|
||
task_record_id: 任务记录ID
|
||
|
||
Returns:
|
||
bool: 是否在队列中
|
||
"""
|
||
return task_record_id in self.queue_manager.priority_map
|
||
|
||
def _get_queue_size(self) -> int:
|
||
"""
|
||
获取队列大小
|
||
|
||
Returns:
|
||
int: 队列大小
|
||
"""
|
||
queue_sizes = self.queue_manager.get_queue_sizes()
|
||
return sum(queue_sizes)
|
||
|
||
def get_scheduler_status(self) -> Dict[str, Any]:
    """Collect a status snapshot of the scheduler and its components.

    Returns:
        Dict[str, Any]: Running flag, number of running tasks, the status
        reported by each manager and the main configuration values.
    """
    snapshot = {
        "is_running": self.is_running,
        "running_tasks": len(self.running_tasks),
    }

    # Ask each component for its own view.
    snapshot["queue_status"] = self.queue_manager.get_queue_status()
    snapshot["worker_status"] = self.worker_manager.get_worker_status()
    snapshot["periodic_status"] = self.periodic_task_manager.get_task_status()
    snapshot["backup_status"] = self.persistence_manager.get_backup_status()

    # Configuration values exposed to callers.
    exposed_options = (
        "worker_min_count",
        "worker_max_count",
        "queue_count",
        "task_timeout",
        "max_retry_count",
        "retry_delay",
    )
    snapshot["config"] = {option: getattr(self, option) for option in exposed_options}

    return snapshot
|
||
|
||
async def _check_task_cancel(self, task_record_id: str, executor: 'TaskExecutor') -> None:
    """Poll the database and cancel the executor once the task is marked CANCELED.

    Runs until the cancel mark is seen and the executor has been cancelled,
    or until this coroutine itself is cancelled by its owner. A failing
    check is logged and polling continues, so a transient database error
    does not disable cancellation for the rest of the task.

    Args:
        task_record_id: ID of the task record.
        executor: Executor running the task.
    """
    # The model, enum, session factory and ``select`` are already imported
    # at module level.
    check_interval = 1.0  # seconds between two checks

    while True:
        await asyncio.sleep(check_interval)

        try:
            async with get_async_session() as session:
                result = await session.execute(
                    select(VWEDTaskRecord.status).where(VWEDTaskRecord.id == task_record_id)
                )
                status = result.scalar_one_or_none()

            if status == TaskStatus.CANCELED:
                logger.info(f"检测到任务 {task_record_id} 被标记为取消，正在停止执行")
                await executor.cancel()
                return
        except asyncio.CancelledError:
            # Cancellation of the checker itself must always propagate.
            raise
        except Exception as e:
            logger.error(f"检查任务取消状态时出错: {str(e)}")
|
||
|
||
def check_task_def_select_agv(self, detail_json: str) -> int:
    """Check whether a task definition contains a SELECT_AGV block.

    The block tree is walked with an explicit stack instead of recursion.
    Entries that are not dicts are skipped, and ``children`` may be either a
    mapping of child lists or a plain list.

    Args:
        detail_json: Task definition detail as a JSON string. An already
            decoded dict is accepted as well.

    Returns:
        int: 1 when a block with the SELECT_AGV block type exists, else 0
        (also for empty, malformed or unexpected input).
    """
    try:
        if not detail_json:
            return 0

        detail = detail_json if isinstance(detail_json, dict) else json.loads(detail_json)
        if not isinstance(detail, dict):
            return 0

        root_block = detail.get("rootBlock")
        if not root_block:
            return 0

        select_agv_type = RobotBlockName.SELECT_AGV

        pending = [root_block]
        while pending:
            block = pending.pop()
            if not isinstance(block, dict):
                continue

            if block.get("blockType") == select_agv_type:
                return 1

            children = block.get("children") or {}
            child_groups = children.values() if isinstance(children, dict) else [children]
            for group in child_groups:
                if isinstance(group, list):
                    pending.extend(group)

        return 0

    except json.JSONDecodeError:
        logger.error(f"解析任务定义详情JSON失败: {detail_json[:100]}...")
        return 0
    except Exception as e:
        logger.error(f"检查任务定义中SELECT_AGV类型异常: {str(e)}")
        return 0
|
||
|
||
# Module-level scheduler singleton.
scheduler = EnhancedTaskScheduler()