def parse_smart_info(self, output, controller_id):
    """Parse the text printed by smartctl for a SCSI/SAS drive into a dict.

    Args:
        output: Raw smartctl text for one controller slot. ``None`` or an
            empty string means the command produced nothing.
        controller_id: Index of the controller the text belongs to. It is
            accepted for interface compatibility and is not used here.

    Returns:
        ``{"error": "No data available"}`` when ``output`` is empty.
        Otherwise a dict holding only the fields that were found:
        ``vendor``, ``device_model``, ``firmware_version``,
        ``serial_number``, ``capacity``, ``rotation_rate``,
        ``health_status``, ``temperature``, ``power_on_time``,
        ``manufactured_date``, ``read_errors``, ``write_errors`` and
        ``self_test_results``. All values are strings (or dicts/lists of
        strings) with surrounding whitespace removed.
    """
    if not output:
        return {"error": "No data available"}

    parsed_info = {}

    # Single-line "Label: value" fields. The separator is limited to spaces
    # and tabs so that a label with an empty value cannot swallow the
    # following line, which `\s+` would do because it also matches "\n".
    simple_fields = (
        ("vendor", r"Vendor:[ \t]*(.*)"),
        ("device_model", r"Product:[ \t]*(.*)"),
        ("firmware_version", r"Revision:[ \t]*(.*)"),
        ("serial_number", r"Serial number:[ \t]*(.*)"),
        ("capacity", r"User Capacity:[ \t]*(.*?)\s+bytes"),
        ("rotation_rate", r"Rotation Rate:[ \t]*(.*)"),
        ("health_status", r"SMART Health Status:[ \t]*(.*)"),
        ("temperature", r"Current Drive Temperature:\s+([0-9]+)"),
        ("power_on_time",
         r"Accumulated power on time, hours:minutes\s+([0-9:]+)"),
    )
    for key, pattern in simple_fields:
        found = re.search(pattern, output)
        if not found:
            continue
        # Strip trailing blanks / "\r" so comparisons such as
        # health_status == "OK" made by the report code behave as intended.
        value = found.group(1).strip()
        if value:
            parsed_info[key] = value

    # Manufacturing date, rendered as "<year>-W<week>".
    manufactured = re.search(r"Manufactured in week (\d+) of year (\d+)", output)
    if manufactured:
        parsed_info["manufactured_date"] = (
            f"{manufactured.group(2)}-W{manufactured.group(1)}"
        )

    # Error counter log rows. Seven numeric columns follow the label; the
    # 4th is reported as "corrected", the 6th as the processed data volume
    # and the 7th as "uncorrected".
    for label in ("read", "write"):
        counters = re.search(
            label + r":\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+\.\d+)\s+(\d+)",
            output,
        )
        if counters:
            parsed_info[f"{label}_errors"] = {
                "corrected": counters.group(4),
                "uncorrected": counters.group(7),
                "data_processed": f"{counters.group(6)} GB",
            }

    # SMART self-test log: everything between the column header and the
    # next blank line (or the end of the text). Trailing blanks / "\r" after
    # the header lines are tolerated.
    section = re.search(
        r"SMART Self-test log[ \t\r]*\n"
        r"Num\s+Test\s+Status\s+segment\s+LifeTime\s+"
        r"LBA_first_err \[SK ASC ASQ\][ \t\r]*\n(.*?)(?:\n\s*\n|$)",
        output,
        re.DOTALL,
    )
    if section:
        # A row looks like:
        #   "# 1  Background short  Completed   -   12345   - [- - -]"
        # The test name and the status may both contain single spaces, so a
        # plain whitespace split mis-assigns every column. Instead anchor on
        # "#<num>", the 2+ space gap after the test name, and the
        # "<segment> <lifetime>" pair that follows the status.
        row_pattern = re.compile(
            r"^#\s*(\d+)\s+(.+?)\s{2,}(.+?)\s+(-|\d+)\s+(NOW|\d+)(?=\s|$)"
        )
        test_results = []
        for raw_line in section.group(1).splitlines():
            row = row_pattern.match(raw_line.strip())
            if not row:
                # Second header line ("Description ... (hours)") or any
                # other non-entry text.
                continue
            test_results.append({
                "id": row.group(1),
                "type": row.group(2).strip(),
                "status": row.group(3).strip(),
                "lifetime_hours": row.group(5),
            })
        if test_results:
            parsed_info["self_test_results"] = test_results

    return parsed_info
info["health_status"] == "OK") + detected_count = sum(1 for cid, info in self.results.items() + if "health_status" in info) f.write(f"### 健康状态概览\n") f.write(f"- 总控制器数: {self.controller_count}\n") + f.write(f"- 检测到的控制器数: {detected_count}\n") f.write(f"- 健康控制器数: {healthy_count}\n") - f.write(f"- 异常控制器数: {self.controller_count - healthy_count}\n\n") + f.write(f"- 异常控制器数: {detected_count - healthy_count}\n\n") - # 详细信息表格 - f.write("### 详细信息\n") - f.write("| 控制器ID | 设备型号 | 序列号 | 固件版本 | 健康状态 | 温度(°C) |\n") - f.write("|----------|----------|--------|----------|----------|----------|\n") + # 详细信息表格 - 简化版 + f.write("### 基本信息概览\n") + f.write("| 控制器ID | 厂商 | 设备型号 | 序列号 | 健康状态 | 温度(°C) | 通电时间 |\n") + f.write("|----------|------|----------|--------|----------|----------|----------|\n") for controller_id in range(self.controller_count): info = self.results.get(controller_id, {}) - f.write(f"| {controller_id} ") - f.write(f"| {info.get('device_model', 'N/A')} ") - f.write(f"| {info.get('serial_number', 'N/A')} ") - f.write(f"| {info.get('firmware_version', 'N/A')} ") - f.write(f"| {info.get('health_status', 'N/A')} ") - f.write(f"| {info.get('temperature', 'N/A')} |\n") + if "health_status" in info: # 只显示有数据的控制器 + f.write(f"| {controller_id} ") + f.write(f"| {info.get('vendor', 'N/A')} ") + f.write(f"| {info.get('device_model', 'N/A')} ") + f.write(f"| {info.get('serial_number', 'N/A')} ") + f.write(f"| {info.get('health_status', 'N/A')} ") + f.write(f"| {info.get('temperature', 'N/A')} ") + f.write(f"| {info.get('power_on_time', 'N/A')} |\n") - f.write("\n## 异常详情\n\n") + # 为每个控制器添加详细信息部分 + f.write("\n## 控制器详细信息\n\n") - # 添加异常详情 has_issues = False for controller_id in range(self.controller_count): info = self.results.get(controller_id, {}) - if "health_status" in info and info["health_status"] != "OK": - has_issues = True + if "health_status" in info: # 只显示有数据的控制器 f.write(f"### 控制器 {controller_id}\n") - f.write(f"- 健康状态: **{info['health_status']}**\n") + f.write(f"- **厂商**: 
{info.get('vendor', 'N/A')}\n") + f.write(f"- **设备型号**: {info.get('device_model', 'N/A')}\n") + f.write(f"- **序列号**: {info.get('serial_number', 'N/A')}\n") + f.write(f"- **固件版本**: {info.get('firmware_version', 'N/A')}\n") + f.write(f"- **容量**: {info.get('capacity', 'N/A')}\n") + f.write(f"- **旋转速率**: {info.get('rotation_rate', 'N/A')}\n") + f.write(f"- **制造日期**: {info.get('manufactured_date', 'N/A')}\n") + f.write(f"- **健康状态**: **{info.get('health_status', 'N/A')}**\n") + f.write(f"- **当前温度**: {info.get('temperature', 'N/A')}°C\n") + f.write(f"- **累计通电时间**: {info.get('power_on_time', 'N/A')}\n") + + # 错误计数信息 + if "read_errors" in info: + read_err = info["read_errors"] + f.write("- **读取错误**:\n") + f.write(f" - 已纠正: {read_err['corrected']}\n") + f.write(f" - 未纠正: {read_err['uncorrected']}\n") + f.write(f" - 处理数据量: {read_err['data_processed']}\n") + + if "write_errors" in info: + write_err = info["write_errors"] + f.write("- **写入错误**:\n") + f.write(f" - 已纠正: {write_err['corrected']}\n") + f.write(f" - 未纠正: {write_err['uncorrected']}\n") + f.write(f" - 处理数据量: {write_err['data_processed']}\n") + + # SMART自检日志 + if "self_test_results" in info: + f.write("- **SMART自检结果**:\n") + for test in info["self_test_results"]: + f.write(f" - 测试 #{test['id']}: {test['type']} - {test['status']} (运行时间: {test['lifetime_hours']}小时)\n") + + # 检查是否有异常 + if info.get("health_status", "") != "OK": + has_issues = True + f.write("\n**⚠️ 警告:此控制器状态异常,请及时关注!**\n") - if "important_attributes" in info: - f.write("- 关键属性异常:\n") - for attr_name, attr_info in info["important_attributes"].items(): - f.write(f" - {attr_name}: 值={attr_info['value']}, 最差={attr_info['worst']}, 阈值={attr_info['threshold']}\n") f.write("\n") - if not has_issues: - f.write("暂无异常情况。\n") + f.write("\n## 异常汇总\n\n") + if has_issues: + f.write("**发现异常控制器,请关注以下问题:**\n\n") + for controller_id in range(self.controller_count): + info = self.results.get(controller_id, {}) + if info.get("health_status", "") != "OK": + f.write(f"- **控制器 
{controller_id}**: 状态为 {info.get('health_status', '未知')}\n") + f.write("\n") + else: + f.write("**未发现异常情况。**\n\n") f.write("\n## 建议\n\n") if has_issues: - f.write("1. 请关注异常控制器的状态变化\n") - f.write("2. 建议备份重要数据\n") - f.write("3. 必要时考虑更换有问题的硬盘\n") + f.write("1. **立即关注异常控制器**的状态变化,并考虑进行进一步测试\n") + f.write("2. **备份重要数据**,防止因硬盘故障导致数据丢失\n") + f.write("3. **联系技术支持**,评估是否需要更换有问题的硬盘\n") + f.write("4. 增加监控频率,密切关注错误计数的变化趋势\n") else: - f.write("1. 保持定期检查硬盘健康状态\n") - f.write("2. 继续做好数据备份工作\n") - f.write("3. 注意监控系统温度变化\n") + f.write("1. **保持定期检查**硬盘健康状态,建议每两周至少检查一次\n") + f.write("2. **继续做好数据备份工作**,采用异地备份和多版本策略\n") + f.write("3. **注意监控系统温度变化**,保持良好的散热环境\n") + f.write("4. 记录并跟踪硬盘的通电时间和错误计数趋势\n") print(f"Markdown报告已生成: {self.md_report}") return self.md_report @@ -172,11 +258,14 @@ class DiskInspection: with open(report_path, 'r') as f: report_content = f.read() - # 构建提交给MCP的数据 + # 构建提交给MCP的数据 - 按照服务器要求的格式,timestamp应为Unix时间戳整数 mcp_data = { - "type": "disk_inspection_report", - "timestamp": datetime.datetime.now().isoformat(), - "content": report_content + "data": { + "type": "disk_inspection_report", + "content": report_content + }, + "type": "disk_inspection", + "timestamp": int(datetime.datetime.now().timestamp()) } print("正在准备通过MCP提交报告给AI分析...") @@ -190,14 +279,10 @@ class DiskInspection: # 尝试使用requests库直接向MCP服务器发送请求 try: - import requests - # MCP服务器配置 - mcp_server_url = "http://localhost:8080/mcp/v1/submit" - - print(f"正在向MCP服务器发送请求: {mcp_server_url}") + print(f"正在向MCP服务器发送请求: {MCP_SERVER_URL}") response = requests.post( - mcp_server_url, + MCP_SERVER_URL, json=mcp_data, headers={"Content-Type": "application/json"} ) @@ -221,12 +306,12 @@ class DiskInspection: print("未安装requests库,无法直接发送HTTP请求") print("请运行: pip install requests 来安装必要的依赖") print("或者使用以下命令手动提交数据:") - print(f"curl -X POST -H 'Content-Type: application/json' -d @{temp_json} http://localhost:8080/mcp/v1/submit") + print(f"curl -X POST -H 'Content-Type: application/json' -d @{temp_json} {MCP_SERVER_URL}") except 
Exception as e: print(f"发送请求到MCP服务器时出错: {str(e)}") print("请检查MCP服务器是否正常运行") print("或者使用以下命令手动提交数据:") - print(f"curl -X POST -H 'Content-Type: application/json' -d @{temp_json} http://localhost:8080/mcp/v1/submit") + print(f"curl -X POST -H 'Content-Type: application/json' -d @{temp_json} {MCP_SERVER_URL}") return True except Exception as e: