#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
增强版任务调度器模块
负责任务的高级调度和执行
实现了优先级队列、工作线程管理、任务持久化等特性
"""
import asyncio
import logging
import json
import os
import uuid
import time
import traceback
from typing import Dict, List, Any, Optional, Set, Tuple, Union
from datetime import datetime, timedelta
from sqlalchemy import select, update, func
from sqlalchemy.ext.asyncio import AsyncSession
from config.settings import settings
from data.models.taskdef import VWEDTaskDef
from data.models.taskrecord import VWEDTaskRecord
from data.session import get_async_session
from services.execution.task_executor import TaskExecutor
from services.execution.handlers.model.block_name import RobotBlockName
from services.enhanced_scheduler.priority_queue_manager import PriorityQueueManager
from services.enhanced_scheduler.worker_manager import WorkerManager
from services.enhanced_scheduler.task_persistence import TaskPersistenceManager
from services.enhanced_scheduler.periodic_task_manager import PeriodicTaskManager
from data.enum.task_record_enum import TaskStatus, SourceType
from data.enum.task_def_enum import EnableStatus, PeriodicTaskStatus, TaskStatusEnum
from utils.logger import get_logger
# Obtain the logger for this module
logger = get_logger("services.enhanced_scheduler.task_scheduler")
class EnhancedTaskScheduler:
"""
增强版任务调度器类
实现了高级任务调度功能
"""
_instance = None
def __new__(cls):
"""
单例模式实现
"""
if cls._instance is None:
cls._instance = super(EnhancedTaskScheduler, cls).__new__(cls)
cls._instance._initialized = False
return cls._instance
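    # Illustrative note: because __new__ implements a singleton, repeated
    # construction returns the same object, e.g.
    #   assert EnhancedTaskScheduler() is EnhancedTaskScheduler()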
def __init__(self):
"""
初始化调度器
"""
if not self._initialized:
# 基本属性
self.running_tasks = {} # 正在运行的任务 {task_id: executor}
self.is_running = False # 调度器是否正在运行
# 配置信息
self._load_config()
# 初始化组件
self._init_components()
self._initialized = True
logger.info("增强版任务调度器初始化完成")
def _load_config(self):
"""
加载配置信息
"""
# 从配置文件加载设置
self.worker_min_count = settings.TASK_SCHEDULER_MIN_WORKER_COUNT # 最小工作线程数
self.worker_max_count = settings.TASK_SCHEDULER_MAX_WORKER_COUNT # 最大工作线程数
self.queue_count = settings.TASK_SCHEDULER_QUEUE_COUNT # 队列数量
# 队列阈值百分比配置
self.queue_threshold_percentiles = settings.TASK_SCHEDULER_QUEUE_THRESHOLD_PERCENTILES
# 工作线程分配比例
self.worker_ratios = settings.TASK_SCHEDULER_WORKER_RATIOS
# 任务超时时间(秒)
self.task_timeout = settings.TASK_SCHEDULER_TASK_TIMEOUT
# 任务重试配置
self.max_retry_count = settings.TASK_SCHEDULER_MAX_RETRY_COUNT
self.retry_delay = settings.TASK_SCHEDULER_RETRY_DELAY
# 备份配置
self.backup_interval = settings.TASK_SCHEDULER_BACKUP_INTERVAL
self.backup_dir = settings.TASK_SCHEDULER_BACKUP_DIR
self.max_backups = settings.TASK_SCHEDULER_MAX_BACKUPS
# 自动扩缩容配置
self.cpu_threshold = settings.TASK_SCHEDULER_CPU_THRESHOLD
self.memory_threshold = settings.TASK_SCHEDULER_MEMORY_THRESHOLD
self.auto_scale_interval = settings.TASK_SCHEDULER_AUTO_SCALE_INTERVAL
self.worker_heartbeat_interval = settings.TASK_SCHEDULER_WORKER_HEARTBEAT_INTERVAL
logger.info(f"加载任务调度器配置: 工作线程数={self.worker_min_count}-{self.worker_max_count}, "
f"队列数={self.queue_count}, 任务超时={self.task_timeout}")
def _init_components(self):
"""
初始化组件
"""
# 初始化优先级队列管理器
self.queue_manager = PriorityQueueManager(
queue_count=self.queue_count,
threshold_percentiles=self.queue_threshold_percentiles,
worker_ratios=self.worker_ratios
)
# 初始化工作线程管理器
self.worker_manager = WorkerManager(
min_workers=self.worker_min_count,
max_workers=self.worker_max_count,
cpu_threshold=self.cpu_threshold,
memory_threshold=self.memory_threshold,
auto_scale_interval=self.auto_scale_interval,
worker_heartbeat_interval=self.worker_heartbeat_interval,
queue_manager=self.queue_manager
)
# 设置工作线程工厂函数
self.worker_manager.set_worker_factory(self._worker)
# 设置队列大小获取方法
self.worker_manager.set_queue_size_getter(self._get_queue_size)
# 初始化任务持久化管理器
self.persistence_manager = TaskPersistenceManager(
backup_interval=self.backup_interval,
backup_dir=self.backup_dir,
max_backups=self.max_backups
)
# 初始化定时任务管理器
self.periodic_task_manager = PeriodicTaskManager()
# 设置定时任务回调函数
self.periodic_task_manager.set_run_task_callback(self._run_periodic_task)
# 监控任务
self.monitor_task = None
async def start(self, worker_count: int = None) -> None:
"""
启动调度器
Args:
worker_count: 初始工作线程数量默认为min_workers
"""
if self.is_running:
logger.warning("增强版任务调度器已经在运行中")
return
self.is_running = True
# 启动优先级队列管理器
# 无需显式启动
# 启动工作线程管理器
initial_workers = worker_count if worker_count is not None else self.worker_min_count
await self.worker_manager.start(initial_workers)
# 启动任务持久化管理器
await self.persistence_manager.start()
# 启动定时任务管理器
await self.periodic_task_manager.start()
# 启动监控任务
self.monitor_task = asyncio.create_task(self._monitor())
# 恢复未完成的任务
await self._restore_pending_tasks()
logger.info(f"增强版任务调度器启动成功,工作线程数: {initial_workers}")
async def stop(self) -> None:
"""
停止调度器
"""
if not self.is_running:
logger.warning("增强版任务调度器未在运行")
return
self.is_running = False
# 取消监控任务
if self.monitor_task:
self.monitor_task.cancel()
try:
await self.monitor_task
except asyncio.CancelledError:
pass
self.monitor_task = None
# 停止定时任务管理器
await self.periodic_task_manager.stop()
# 停止任务持久化管理器
await self.persistence_manager.stop()
# 停止工作线程管理器
await self.worker_manager.stop()
# 取消所有正在运行的任务
for task_id, executor in list(self.running_tasks.items()):
try:
await executor.cancel()
except Exception as e:
logger.error(f"取消任务 {task_id} 异常: {str(e)}")
self.running_tasks.clear()
logger.info("增强版任务调度器已停止")
async def _restore_pending_tasks(self) -> None:
"""
恢复未完成的任务
只恢复真正需要恢复的任务,避免重复加载已处理过的任务
"""
try:
# 加载未完成的任务
pending_tasks = await self.persistence_manager.load_pending_tasks()
if not pending_tasks:
logger.info("无待恢复的任务")
return
# 筛选需要恢复的任务
tasks_to_restore = {}
tasks_to_remove = []
async with get_async_session() as session:
for task_id, task_data in pending_tasks.items():
# 跳过正在运行的任务
if task_id in self.running_tasks:
continue
# 查询数据库中的当前状态
result = await session.execute(
select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_id)
)
task_record = result.scalars().first()
if not task_record:
# 数据库中任务不存在,从持久化中移除
tasks_to_remove.append(task_id)
continue
# 检查任务状态
current_status = task_record.status
# 只恢复这些状态的任务:
# 1001:执行中(可能是上次异常退出)
# 1002:排队中
if current_status in [TaskStatus.RUNNING, TaskStatus.QUEUED]:
tasks_to_restore[task_id] = task_data
else:
# 其他状态的任务(如已完成、失败、取消)不需要恢复
tasks_to_remove.append(task_id)
logger.debug(f"任务 {task_id} 状态为 {current_status},不需要恢复")
# 删除不需要恢复的任务
for task_id in tasks_to_remove:
await self.persistence_manager.remove_task(task_id)
# 提交需要恢复的任务到队列
for task_id, task_data in tasks_to_restore.items():
priority = task_data.get("priority", PeriodicTaskStatus.PERIODIC)
# 提交到队列
await self.queue_manager.enqueue(task_id, priority)
# 添加到持久化列表
await self.persistence_manager.add_task(task_id, priority, task_data.get("info", {}))
logger.info(f"恢复任务 {task_id}, 优先级: {priority}")
logger.info(f"已恢复 {len(tasks_to_restore)} 个未完成的任务,移除 {len(tasks_to_remove)} 个不需要恢复的任务")
except Exception as e:
logger.error(f"恢复未完成任务异常: {str(e)}")
logger.error(traceback.format_exc())
async def submit_task(self, task_record_id: str) -> Dict[str, Any]:
"""
提交任务到队列
Args:
task_record_id: 任务记录ID
Returns:
Dict[str, Any]: 提交结果
"""
# 检查任务是否已在队列中或正在执行
if task_record_id in self.running_tasks:
return {
"success": False,
"message": "任务已在执行中",
"taskRecordId": task_record_id
}
# 检查任务记录是否存在
async with get_async_session() as session:
result = await session.execute(
select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
)
task_record = result.scalars().first()
if not task_record:
return {
"success": False,
"message": "任务记录不存在",
"taskRecordId": task_record_id
}
# 检查任务状态
if task_record.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELED]: # 完成、失败、取消
return {
"success": False,
"message": f"任务已结束,状态码: {task_record.status}",
"taskRecordId": task_record_id,
"status": task_record.status
}
# 获取任务优先级
priority = task_record.priority or PeriodicTaskStatus.PERIODIC # 如果为None或0使用默认值1
# 更新任务状态为排队中
task_record.status = TaskStatus.QUEUED # 排队中
await session.commit()
# 构建任务信息
try:
input_params = json.loads(task_record.input_params) if task_record.input_params else {}
except Exception:
input_params = {}
task_info = {
"id": task_record.id,
"def_id": task_record.def_id,
"def_label": task_record.def_label,
"def_version": task_record.def_version,
"status": task_record.status,
"created_at": task_record.created_at,
"priority": priority,
"periodic_task": task_record.periodic_task,
"input_params": input_params
}
# 添加到持久化管理器
await self.persistence_manager.add_task(task_record_id, priority, task_info)
# 添加到队列
queue_index = await self.queue_manager.enqueue(task_record_id, priority)
response = {
"success": True,
"message": "任务已提交到队列",
"taskRecordId": task_record_id,
"queueIndex": queue_index,
"queueSize": self._get_queue_size(),
"priority": priority
}
# 如果所有工作线程都在工作中,添加警告信息
use_workers = [worker_id for worker_id, status in self.worker_manager.worker_status.items() if status.get("current_task") is not None]
if len(use_workers) == len(self.worker_manager.worker_status):
response["warning"] = f"警告:所有工作线程({len(self.worker_manager.worker_status)}个)都在忙碌中,任务可能需要等待较长时间"
logger.warning(f"提交任务 {task_record_id} 时所有工作线程都在忙碌中,当前工作线程数: {len(self.worker_manager.worker_status)}")
return response
async def run_task(self, task_def_id: str, params: List[Dict[str, Any]] = None, parent_task_id: str = None,
root_task_id: str = None, source_type: int = None, source_system: str = None,
source_device: str = None, source_time: datetime = None, source_ip: str = None,
source_client_info: str = None, tf_api_token: str = None, map_id: str = None) -> Dict[str, Any]:
"""
运行任务
创建任务记录并提交到队列
Args:
task_def_id: 任务定义ID
params: 任务参数
parent_task_id: 父任务记录ID用于任务链
root_task_id: 根任务记录ID用于任务树追踪
source_type: 任务来源类型
source_system: 来源系统标识
source_device: 下达任务的硬件设备标识
source_time: 任务下达时间
source_ip: 下达任务的IP地址
source_client_info: 客户端设备信息
tf_api_token: 主任务系统API Token
map_id: 相关地图ID
Returns:
Dict[str, Any]: 运行结果
"""
try:
# 检查任务定义是否存在
async with get_async_session() as session:
result = await session.execute(
select(VWEDTaskDef).where(VWEDTaskDef.id == task_def_id)
)
task_def = result.scalars().first()
if not task_def:
return {
"success": False,
"message": "任务定义不存在",
"taskDefId": task_def_id
}
map_id = task_def.map_id
user_token = task_def.user_token
# 创建任务记录
task_record_id = str(uuid.uuid4())
task_record = VWEDTaskRecord(
id=task_record_id,
def_id=task_def.id,
def_label=task_def.label,
def_version=task_def.version,
parent_task_record_id=parent_task_id,
root_task_record_id=root_task_id,
status=TaskStatus.QUEUED, # 队列中
input_params=json.dumps(params or [], ensure_ascii=False) if params else None,
periodic_task=task_def.periodic_task or PeriodicTaskStatus.NON_PERIODIC,
task_def_detail=task_def.detail,
source_type=source_type,
source_system=source_system,
source_device=source_device,
source_time=source_time,
source_ip=source_ip,
source_client_info=source_client_info
)
# 保存到数据库
session.add(task_record)
await session.flush()
# 获取任务记录的优先级
priority = task_record.priority or PeriodicTaskStatus.PERIODIC
# 提交到队列
result = await self.submit_task(task_record_id)
select_agv_type = self.check_task_def_select_agv(task_def.detail)
# 同步任务到主任务系统
try:
# 导入同步服务
from services.sync_service import create_task as tf_create_task
# 调用主任务系统创建任务接口
sync_response = await tf_create_task(
task_record_id=task_record.id,
task_name=task_record.def_label,
is_periodic=task_record.periodic_task,
priority=task_record.priority,
parent_id=task_record.parent_task_record_id if task_record.parent_task_record_id else "",
token=user_token,
map_id=map_id,
is_agv=select_agv_type
)
if sync_response and sync_response.get("success"):
logger.info(f"成功同步任务到主任务系统: {task_record_id}")
else:
logger.warning(f"同步任务到主任务系统失败: {task_record_id}")
except Exception as e:
logger.error(f"同步任务到主任务系统时发生错误: {str(e)}")
# 不影响主流程,继续执行
return {
"success": True,
"message": "任务已创建并提交到队列",
"taskRecordId": task_record_id,
"status": TaskStatus.QUEUED,
"priority": priority,
"createTime": datetime.now(),
"queueResult": result
}
except Exception as e:
logger.error(f"运行任务失败: {str(e)}")
return {
"success": False,
"message": f"运行任务失败: {str(e)}",
"taskDefId": task_def_id
}
async def cancel_task(self, task_record_id: str) -> Dict[str, Any]:
"""
取消任务
Args:
task_record_id: 任务记录ID
Returns:
Dict[str, Any]: 取消结果
"""
logger.info(f"准备取消任务: {task_record_id}")
# 检查任务记录是否存在
async with get_async_session() as session:
result = await session.execute(
select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
)
task_record = result.scalars().first()
if not task_record:
return {
"success": False,
"message": "任务记录不存在",
"taskRecordId": task_record_id
}
# 如果任务已经结束,不能取消
if task_record.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELED]: # 完成、失败、取消
return {
"success": False,
"message": f"任务已结束,状态码: {task_record.status}",
"taskRecordId": task_record_id,
"status": task_record.status
}
# 查找所有正在运行的子任务
child_tasks_query = await session.execute(
select(VWEDTaskRecord).where(
VWEDTaskRecord.parent_task_record_id == task_record_id,
VWEDTaskRecord.status == TaskStatus.RUNNING # 执行中状态码
)
)
child_tasks = child_tasks_query.scalars().all()
# 记录取消子任务的结果
child_tasks_results = []
# 先取消所有子任务
for child_task in child_tasks:
logger.info(f"取消子任务: {child_task.id}, 父任务: {task_record_id}")
# 递归调用,取消子任务(可能有孙子任务)
child_cancel_result = await self.cancel_task(child_task.id)
child_tasks_results.append({
"taskRecordId": child_task.id,
"success": child_cancel_result.get("success", False),
"message": child_cancel_result.get("message", "未知结果")
})
# 检查任务是否在执行中
if task_record_id in self.running_tasks:
executor = self.running_tasks[task_record_id]
result = await executor.cancel()
# 从持久化管理器中移除
await self.persistence_manager.remove_task(task_record_id)
# 添加子任务取消结果
if child_tasks_results:
result["childTasksResults"] = child_tasks_results
return result
# 如果不在执行中,直接更新状态
async with get_async_session() as session:
# 更新状态为取消
task_record.status = TaskStatus.CANCELED # 取消
task_record.ended_reason = "任务被取消"
task_record.ended_on = datetime.now()
task_record.allow_restart_same_location = True # 设置为True允许相同地址再次启动任务
await session.commit()
# 从持久化管理器中移除
await self.persistence_manager.remove_task(task_record_id)
# 更新任务定义状态为普通状态(0)
# 获取def_id
def_id = task_record.def_id
# 使用新会话更新任务定义状态,避免事务冲突
async with get_async_session() as new_session:
await new_session.execute(
update(VWEDTaskDef)
.where(VWEDTaskDef.id == def_id)
.values(status=TaskStatusEnum.PENDING)
)
await new_session.commit()
logger.info(f"更新任务定义状态为普通状态: {def_id}")
result = {
"success": True,
"message": "任务已取消",
"taskRecordId": task_record_id
}
# 添加子任务取消结果
if child_tasks_results:
result["childTasksResults"] = child_tasks_results
return result
async def get_task_status(self, task_record_id: str) -> Dict[str, Any]:
"""
获取任务状态
Args:
task_record_id: 任务记录ID
Returns:
Dict[str, Any]: 任务状态信息
"""
try:
async with get_async_session() as session:
result = await session.execute(
select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
)
task_record = result.scalars().first()
if not task_record:
return {
"success": False,
"message": "任务记录不存在",
"taskRecordId": task_record_id
}
# 获取当前队列中的任务数量
in_queue = await self._task_in_queue(task_record_id)
# 格式化任务信息
status_info = {
"taskRecordId": task_record.id,
"taskDefId": task_record.def_id,
"defLabel": task_record.def_label,
"defVersion": task_record.def_version,
"status": task_record.status,
"createdOn": task_record.created_at,
"endedOn": task_record.ended_on,
"executorTime": task_record.executor_time,
"firstExecutorTime": task_record.first_executor_time,
"endedReason": task_record.ended_reason,
"priority": task_record.priority,
"periodicTask": task_record.periodic_task,
"runningInCurrentQueue": task_record_id in self.running_tasks,
"inQueue": in_queue,
"retryCount": getattr(task_record, "retry_count", 0)
}
return {
"success": True,
"data": status_info
}
except Exception as e:
logger.error(f"获取任务状态失败: {str(e)}")
return {
"success": False,
"message": f"获取任务状态失败: {str(e)}",
"taskRecordId": task_record_id
}
async def update_periodic_task(self, task_def_id: str, enable: bool = None) -> Dict[str, Any]:
"""
更新定时任务状态
Args:
task_def_id: 任务定义ID
enable: 是否启用
Returns:
Dict[str, Any]: 更新结果
"""
return await self.periodic_task_manager.update_task(task_def_id, enable)
async def _run_periodic_task(self, task_def_id: str) -> Dict[str, Any]:
"""
运行定时任务
Args:
task_def_id: 任务定义ID
Returns:
Dict[str, Any]: 运行结果
"""
# 对于定时任务,生成系统自动任务来源信息
now = datetime.now()
source_type = SourceType.SYSTEM_SCHEDULING # 1: 系统调度
source_system = "SCHEDULER"
source_device = "SYSTEM"
source_time = now
source_ip = "127.0.0.1" # 本地调度器IP
source_client_info = json.dumps({"system": "SCHEDULER", "type": "PERIODIC"}, ensure_ascii=False)
return await self.run_task(
task_def_id=task_def_id,
source_type=source_type,
source_system=source_system,
source_device=source_device,
source_time=source_time,
source_ip=source_ip,
source_client_info=source_client_info
)
async def _worker(self, worker_id: int) -> None:
"""
工作线程
从队列中获取任务并执行
Args:
worker_id: 工作线程ID
"""
logger.info(f"工作线程 {worker_id} 启动")
while self.is_running:
try:
# 从队列获取任务
queue_index, item = await self.queue_manager.dequeue(worker_id, self.worker_manager.get_worker_count())
# 如果没有任务,继续等待
if queue_index == -1 or item is None:
await asyncio.sleep(0.1) # 短暂休眠
continue
# 解析优先级和任务ID
if isinstance(item, tuple) and len(item) == 2:
priority, task_record_id = item
# 将优先级转回正值
priority = -priority
else:
# 兼容旧格式
task_record_id = item
priority = PeriodicTaskStatus.PERIODIC
# 更新工作线程状态
self.worker_manager.update_worker_status(worker_id, {
"current_task": task_record_id,
"task_priority": priority,
"task_start_time": datetime.now()
})
logger.info(f"工作线程 {worker_id} 获取到任务: {task_record_id}, 优先级: {priority}")
# 执行任务
try:
# 创建任务执行器
executor = TaskExecutor(task_record_id)
# 设置超时时间
executor.set_timeout(self.task_timeout)
# 记录到正在执行的任务
self.running_tasks[task_record_id] = executor
# 从持久化管理器中移除(正在执行的任务不需要持久化)
await self.persistence_manager.remove_task(task_record_id)
# 创建一个取消任务检查器,定期检查数据库中任务是否被标记为取消
cancel_checker_task = asyncio.create_task(self._check_task_cancel(task_record_id, executor))
# 执行任务
result = await executor.execute()
# 取消检查器任务
cancel_checker_task.cancel()
try:
await cancel_checker_task
except asyncio.CancelledError:
pass
# 更新工作线程状态
self.worker_manager.update_worker_status(worker_id, {
"current_task": None,
"last_task": task_record_id,
"last_task_result": result.get("success", False),
"task_count": self.worker_manager.worker_status[worker_id].get("task_count", 0) + 1
})
# 移除正在执行的任务
self.running_tasks.pop(task_record_id, None)
logger.info(f"工作线程 {worker_id} 完成任务: {task_record_id}, 结果: {result.get('success')}")
except Exception as e:
logger.error(f"工作线程 {worker_id} 执行任务异常: {str(e)}")
logger.error(traceback.format_exc())
# 移除正在执行的任务
self.running_tasks.pop(task_record_id, None)
# 检查是否需要重试
await self._handle_task_error(task_record_id, str(e))
# 更新工作线程状态
self.worker_manager.update_worker_status(worker_id, {
"current_task": None,
"last_task": task_record_id,
"last_task_result": False,
"last_error": str(e),
"task_count": self.worker_manager.worker_status[worker_id].get("task_count", 0) + 1
})
# 标记任务完成
self.queue_manager.task_done(queue_index)
# 更新工作线程心跳
self.worker_manager.update_worker_heartbeat(worker_id)
except asyncio.TimeoutError:
# 超时,继续下一次循环
continue
except asyncio.CancelledError:
# 取消异常,退出循环
logger.info(f"工作线程 {worker_id} 被取消")
break
except Exception as e:
logger.error(f"工作线程 {worker_id} 异常: {str(e)}")
logger.error(traceback.format_exc())
# 更新工作线程状态
self.worker_manager.update_worker_status(worker_id, {
"error": str(e),
"error_time": datetime.now()
})
# 短暂休眠,避免频繁错误
await asyncio.sleep(1.0)
logger.info(f"工作线程 {worker_id} 结束")
async def _handle_task_error(self, task_record_id: str, error_message: str) -> None:
"""
处理任务执行错误
Args:
task_record_id: 任务记录ID
error_message: 错误信息
"""
try:
async with get_async_session() as session:
# 获取任务记录
result = await session.execute(
select(VWEDTaskRecord).where(VWEDTaskRecord.id == task_record_id)
)
task_record = result.scalars().first()
if not task_record:
logger.error(f"任务记录不存在: {task_record_id}")
return
# 获取重试次数
retry_count = getattr(task_record, "retry_count", 0) or 0
# 判断是否需要重试
if retry_count < self.max_retry_count:
# 更新重试次数
task_record.retry_count = retry_count + 1
task_record.status = TaskStatus.QUEUED # 排队中
task_record.ended_reason = f"任务执行异常,准备第{retry_count + 1}次重试: {error_message}"
# 提交更新
await session.commit()
# 计算重试延迟时间(使用指数退避算法)
retry_delay = self.retry_delay * (2 ** retry_count)
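                    # Illustrative backoff schedule, assuming a base retry_delay of 5 seconds
                    # and max_retry_count of 3 (actual values come from settings):
                    #   retry #1 -> 5s, retry #2 -> 10s, retry #3 -> 20s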
                    # Schedule an asynchronous delayed retry
                    asyncio.create_task(self._delayed_retry(task_record_id, retry_delay))
                    logger.info(f"Task {task_record_id} will be retried in {retry_delay} seconds (attempt {retry_count + 1})")
                else:
                    # Retry limit reached; mark the task as failed
                    task_record.status = TaskStatus.FAILED  # failed
                    task_record.ended_reason = f"Task execution failed after {retry_count} retries: {error_message}"
                    task_record.ended_on = datetime.now()
                    task_record.allow_restart_same_location = True  # Allow a task to be started again from the same location
                    # Commit the update
                    await session.commit()
                    # Reset the task definition status to normal (0)
                    # Get the definition ID
                    def_id = task_record.def_id
                    # Use a new session to update the task definition status
                    async with get_async_session() as new_session:
                        await new_session.execute(
                            update(VWEDTaskDef)
                            .where(VWEDTaskDef.id == def_id)
                            .values(status=TaskStatusEnum.PENDING)
                        )
                        await new_session.commit()
                        logger.info(f"Reset task definition status to normal: {def_id}")
                    logger.info(f"Task {task_record_id} failed after {retry_count} retries")
        except Exception as e:
            logger.error(f"Error while handling task failure: {str(e)}")
            logger.error(traceback.format_exc())
async def _delayed_retry(self, task_record_id: str, delay_seconds: int) -> None:
"""
延迟重试任务
Args:
task_record_id: 任务记录ID
delay_seconds: 延迟秒数
"""
try:
# 延迟等待
await asyncio.sleep(delay_seconds)
# 重新提交任务
await self.submit_task(task_record_id)
logger.info(f"任务 {task_record_id} 已重新提交")
except Exception as e:
logger.error(f"延迟重试任务异常: {str(e)}")
async def _monitor(self) -> None:
"""
监控任务
定期检查任务状态,清理僵尸任务
"""
logger.info("任务监控启动")
while self.is_running:
try:
# 检查僵尸任务
await self._check_zombie_tasks()
# 休眠一段时间
await asyncio.sleep(settings.TASK_SCHEDULER_ZOMBIE_TASK_CHECK_INTERVAL) # 每分钟检查一次
except asyncio.CancelledError:
# 取消异常,退出循环
logger.info("任务监控被取消")
break
except Exception as e:
logger.error(f"任务监控异常: {str(e)}")
# 出现异常时短暂休眠,避免频繁错误
await asyncio.sleep(5.0)
logger.info("任务监控结束")
async def _check_zombie_tasks(self) -> None:
"""
检查僵尸任务
查找长时间处于运行中但实际已中断的任务
"""
try:
async with get_async_session() as session:
# 查询长时间处于运行中状态的任务
one_hour_ago = datetime.now() - timedelta(hours=1)
query = select(VWEDTaskRecord).where(
VWEDTaskRecord.status == TaskStatus.RUNNING, # 运行中
VWEDTaskRecord.first_executor_time < one_hour_ago # 开始执行时间超过1小时
)
result = await session.execute(query)
zombie_tasks = result.scalars().all()
for task in zombie_tasks:
# 如果任务在当前运行列表中,跳过
if task.id in self.running_tasks:
continue
# 更新任务状态为失败
task.status = TaskStatus.FAILED # 失败
task.ended_reason = "任务执行超时,被系统自动终止"
task.ended_on = datetime.now()
task.allow_restart_same_location = True # 设置为True允许相同地址再次启动任务
if zombie_tasks:
# 提交更新
await session.commit()
logger.info(f"清理 {len(zombie_tasks)} 个僵尸任务")
except Exception as e:
logger.error(f"检查僵尸任务异常: {str(e)}")
async def _task_in_queue(self, task_record_id: str) -> bool:
"""
检查任务是否在队列中
Args:
task_record_id: 任务记录ID
Returns:
bool: 是否在队列中
"""
return task_record_id in self.queue_manager.priority_map
def _get_queue_size(self) -> int:
"""
获取队列大小
Returns:
int: 队列大小
"""
queue_sizes = self.queue_manager.get_queue_sizes()
return sum(queue_sizes)
def get_scheduler_status(self) -> Dict[str, Any]:
"""
获取调度器状态
Returns:
Dict[str, Any]: 调度器状态信息
"""
queue_status = self.queue_manager.get_queue_status()
worker_status = self.worker_manager.get_worker_status()
periodic_status = self.periodic_task_manager.get_task_status()
backup_status = self.persistence_manager.get_backup_status()
return {
"is_running": self.is_running,
"running_tasks": len(self.running_tasks),
"queue_status": queue_status,
"worker_status": worker_status,
"periodic_status": periodic_status,
"backup_status": backup_status,
"config": {
"worker_min_count": self.worker_min_count,
"worker_max_count": self.worker_max_count,
"queue_count": self.queue_count,
"task_timeout": self.task_timeout,
"max_retry_count": self.max_retry_count,
"retry_delay": self.retry_delay,
}
}
async def _check_task_cancel(self, task_record_id: str, executor: 'TaskExecutor') -> None:
"""
定期检查任务是否被标记为取消
Args:
task_record_id: 任务记录ID
executor: 任务执行器
"""
from data.models.taskrecord import VWEDTaskRecord
from data.enum.task_record_enum import TaskStatus
from data.session import get_async_session
from sqlalchemy import select
check_interval = 1.0 # 每秒检查一次
try:
while True:
# 等待一段时间
await asyncio.sleep(check_interval)
# 查询数据库中的任务状态
async with get_async_session() as session:
result = await session.execute(
select(VWEDTaskRecord.status).where(VWEDTaskRecord.id == task_record_id)
)
status = result.scalar_one_or_none()
# 如果任务被标记为取消,则取消执行
if status == TaskStatus.CANCELED:
logger.info(f"检测到任务 {task_record_id} 被标记为取消,正在停止执行")
await executor.cancel()
return
except Exception as e:
logger.error(f"检查任务取消状态时出错: {str(e)}")
def check_task_def_select_agv(self, detail_json: str) -> int:
"""
检查任务定义详情中是否包含SELECT_AGV块类型
Args:
detail_json: 任务定义详情JSON字符串
Returns:
int: 存在返回1不存在返回0
"""
        try:
            # Parse the JSON string
            if not detail_json:
                return 0
            detail = json.loads(detail_json)
            if not detail:
                return 0
            # Block type constant for the SELECT_AGV block
            SELECT_AGV_TYPE = RobotBlockName.SELECT_AGV  # "CSelectAgvBp"
            # Check the root block
            root_block = detail.get("rootBlock")
            if not root_block:
                return 0
            # Recursively check block types
            def check_block(block):
                # Check the current block's type
                if block.get("blockType") == SELECT_AGV_TYPE:
                    return True
                # Recursively check child blocks
                children = block.get("children", {})
                for child_key, child_list in children.items():
                    if isinstance(child_list, list):
                        for child in child_list:
                            if check_block(child):
                                return True
                return False
            # Start the check from the root block
            if check_block(root_block):
                return 1
            return 0
        except json.JSONDecodeError:
            logger.error(f"Failed to parse task definition detail JSON: {detail_json[:100]}...")
            return 0
        except Exception as e:
            logger.error(f"Error while checking for SELECT_AGV blocks in the task definition: {str(e)}")
            return 0
# Create the global scheduler instance
scheduler = EnhancedTaskScheduler()
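
# A minimal usage sketch (illustrative only): it assumes the database and settings are
# already configured and that a real task definition ID is available; "your-task-def-id"
# below is a placeholder, not an actual definition in this project.
if __name__ == "__main__":
    async def _demo():
        # Start the scheduler with its default (minimum) worker count
        await scheduler.start()
        # Create a task record from a definition and enqueue it
        result = await scheduler.run_task(
            task_def_id="your-task-def-id",
            source_type=SourceType.SYSTEM_SCHEDULING,
            source_system="DEMO"
        )
        print(result)
        # Inspect the overall scheduler state
        print(scheduler.get_scheduler_status())
        # Shut down cleanly
        await scheduler.stop()
    asyncio.run(_demo())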