"""
批量导入服务

提供题目数据的批量导入功能，支持：
- 外部API数据源对接
- 重复题目检测
- 断点续传
- 进度跟踪
- 数据验证和清洗
- 导入统计报告
"""

import json
import os
import time
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib

from src.core.logger import get_logger
from src.core.config import Config
from src.database.models import QuestionCreateDTO, QuestionSearchFilter
from src.database.database_manager import DatabaseManager
from src.services.embedding_service import EmbeddingService
from src.services.management_service import ManagementService
from src.services.search_service import SearchService
from src.utils.helpers import validate_uuid, generate_uuid


class ImportSource:
    """String identifiers for the kinds of data source an import session can read from."""

    # Questions are fetched page by page from an HTTP endpoint.
    EXTERNAL_API = "external_api"
    # Questions are read from a JSON file on disk.
    JSON_FILE = "json_file"
    # Questions are read from a CSV file on disk.
    CSV_FILE = "csv_file"
    # Questions are passed in directly inside the session's source config.
    MANUAL = "manual"


class ImportStatus:
    """String identifiers for the lifecycle states stored in an import session."""

    # Session created but not started yet.
    PENDING = "pending"
    # An import run is in progress.
    RUNNING = "running"
    # The run finished without failed items.
    COMPLETED = "completed"
    # The run finished with failed items or aborted with an error.
    FAILED = "failed"
    # The run was paused; the importers stop at the next batch boundary.
    PAUSED = "paused"
    # The run was cancelled; the importers stop at the next batch boundary.
    CANCELLED = "cancelled"


class DataFormatConverter:
    """Translate externally supplied attribute values into the internal (Chinese) vocabulary.

    Each mapping accepts the internal Chinese value itself as well as a set of
    English aliases. Lookups try the stripped value verbatim first and then its
    lower-cased form; anything unknown or empty falls back to a default.
    """

    # Question type: external value -> internal value
    QUESTION_TYPE_MAPPING = {
        "single_choice": "单选",
        "single choice": "单选",
        "单选": "单选",
        "multiple_choice": "多选",
        "multiple choice": "多选",
        "多选": "多选",
        "true_false": "判断",
        "true/false": "判断",
        "判断": "判断",
        "fill_blank": "填空",
        "fill in": "填空",
        "填空": "填空",
        "short_answer": "简答",
        "brief answer": "简答",
        "简答": "简答",
        "essay": "简答",  # possible alias
    }

    # Difficulty: external value -> internal value
    DIFFICULTY_MAPPING = {
        "easy": "简单",
        "简单": "简单",
        "medium": "中等",
        "moderate": "中等",
        "中等": "中等",
        "hard": "困难",
        "difficult": "困难",
        "困难": "困难",
    }

    # Status: external value -> internal value
    STATUS_MAPPING = {
        "draft": "草稿",
        "草稿": "草稿",
        "published": "已发布",
        "publish": "已发布",
        "已发布": "已发布",
        "archived": "已归档",
        "archive": "已归档",
        "已归档": "已归档",
    }

    @classmethod
    def _lookup(cls, value: Optional[str], mapping: Dict[str, str], default: str) -> str:
        """Resolve ``value`` through ``mapping``, returning ``default`` when it is empty or unknown."""
        if not value:
            return default

        text = str(value).strip()
        # Exact match first (covers the Chinese values), then case-insensitive
        # match for the English aliases.
        if text in mapping:
            return mapping[text]
        return mapping.get(text.lower(), default)

    @classmethod
    def convert_question_type(cls, value: Optional[str]) -> str:
        """Convert a question type; empty or unknown values become "单选"."""
        return cls._lookup(value, cls.QUESTION_TYPE_MAPPING, "单选")

    @classmethod
    def convert_difficulty(cls, value: Optional[str]) -> str:
        """Convert a difficulty level; empty or unknown values become "中等"."""
        return cls._lookup(value, cls.DIFFICULTY_MAPPING, "中等")

    @classmethod
    def convert_status(cls, value: Optional[str]) -> str:
        """Convert a publication status; empty or unknown values become "草稿"."""
        return cls._lookup(value, cls.STATUS_MAPPING, "草稿")


class ImportService:
    """
    批量导入服务

    提供题目数据的批量导入功能，支持多种数据源和高级特性。
    """

    def __init__(
        self,
        db_manager: DatabaseManager,
        embedding_service: EmbeddingService,
        management_service: ManagementService,
        search_service: SearchService,
        config: Config,
        logger=None
    ):
        """
        初始化导入服务

        Args:
            db_manager: 数据库管理器
            embedding_service: Embedding服务
            management_service: 题目管理服务
            search_service: 题目检索服务
            config: 配置实例
            logger: 日志记录器
        """
        self.db_manager = db_manager
        self.embedding_service = embedding_service
        self.management_service = management_service
        self.search_service = search_service
        self.config = config
        self.logger = logger or get_logger()

        # 导入配置
        self.import_config = self.config.get("import", {})
        self.batch_size = self.import_config.get("batch_size", 50)
        self.max_retries = self.import_config.get("max_retries", 3)
        self.retry_delay = self.import_config.get("retry_delay", 2)

        # 重复检测配置
        self.duplicate_config = self.import_config.get("duplicate_detection", {})
        self.duplicate_enabled = self.duplicate_config.get("enabled", True)
        self.similarity_threshold = self.duplicate_config.get("similarity_threshold", 0.95)

        # 断点续传配置
        self.checkpoint_config = self.import_config.get("checkpoint", {})
        self.checkpoint_enabled = self.checkpoint_config.get("enabled", True)
        self.checkpoint_file = self.checkpoint_config.get("checkpoint_file", "./data/import_checkpoint.json")

        # 确保数据目录存在
        os.makedirs(os.path.dirname(self.checkpoint_file), exist_ok=True)

    # -------------------------------------------------------------------------
    # 导入任务管理
    # -------------------------------------------------------------------------

    def create_import_session(
        self,
        source_type: str,
        source_config: Dict[str, Any],
        options: Optional[Dict[str, Any]] = None
    ) -> str:
        """
        Create and persist a new import session in the PENDING state.

        Args:
            source_type: One of the ``ImportSource`` values.
            source_config: Source-specific configuration, stored as given.
            options: Optional overrides for the import options; missing keys
                fall back to their defaults.

        Returns:
            str: ID of the newly created session.

        Raises:
            ValueError: If ``source_type`` is not a supported source.
        """
        supported_sources = (
            ImportSource.EXTERNAL_API,
            ImportSource.JSON_FILE,
            ImportSource.CSV_FILE,
            ImportSource.MANUAL,
        )
        # (option name, default) pairs, in the order they are stored.
        option_defaults = (
            ("skip_duplicates", True),
            ("validate_only", False),
            ("replace_existing", False),
            ("import_answers", True),
            ("import_explanations", True),
        )
        counter_names = ("total", "processed", "successful", "failed", "skipped", "duplicates")

        try:
            if source_type not in supported_sources:
                raise ValueError(f"不支持的数据源类型: {source_type}")

            new_id = generate_uuid()
            requested = options or {}
            resolved_options = {
                name: requested.get(name, fallback) for name, fallback in option_defaults
            }

            record = {
                "session_id": new_id,
                "source_type": source_type,
                "source_config": source_config,
                "status": ImportStatus.PENDING,
                "options": resolved_options,
                "created_at": datetime.now().isoformat(),
                "started_at": None,
                "completed_at": None,
                "progress": dict.fromkeys(counter_names, 0),
                "errors": [],
                "checkpoint": None,
            }
            self._save_session(record)
        except Exception as e:
            self.logger.error(f"创建导入会话失败: {e}")
            raise

        self.logger.info(f"创建导入会话: {new_id}")
        return new_id

    def start_import(self, session_id: str) -> Dict[str, Any]:
        """
        开始导入

        Args:
            session_id: 导入会话ID

        Returns:
            Dict: 导入结果统计

        Raises:
            ValueError: 会话不存在
        """
        try:
            session = self._load_session(session_id)
            if not session:
                raise ValueError(f"导入会话不存在: {session_id}")

            if session["status"] in [ImportStatus.RUNNING]:
                raise ValueError(f"导入会话正在运行: {session_id}")

            # 更新状态
            session["status"] = ImportStatus.RUNNING
            session["started_at"] = datetime.now().isoformat()
            self._save_session(session)

            self.logger.info(f"开始导入会话: {session_id}")

            # 根据数据源类型执行导入
            if session["source_type"] == ImportSource.EXTERNAL_API:
                result = self._import_from_external_api(session)
            elif session["source_type"] == ImportSource.JSON_FILE:
                result = self._import_from_json_file(session)
            elif session["source_type"] == ImportSource.CSV_FILE:
                result = self._import_from_csv_file(session)
            elif session["source_type"] == ImportSource.MANUAL:
                result = self._import_from_manual(session)
            else:
                raise ValueError(f"未知的数据源类型: {session['source_type']}")

            # 更新最终状态
            session["status"] = ImportStatus.COMPLETED if result["failed"] == 0 else ImportStatus.FAILED
            session["completed_at"] = datetime.now().isoformat()
            session["progress"].update(result)
            self._save_session(session)

            self.logger.info(
                f"导入会话完成: {session_id}, "
                f"成功{result['successful']}, 失败{result['failed']}, "
                f"跳过{result['skipped']}, 重复{result['duplicates']}"
            )

            return result

        except Exception as e:
            self.logger.error(f"导入失败: {e}")
            # 更新状态为失败
            session = self._load_session(session_id)
            if session:
                session["status"] = ImportStatus.FAILED
                session["completed_at"] = datetime.now().isoformat()
                session["errors"].append({
                    "type": "system_error",
                    "message": str(e),
                    "timestamp": datetime.now().isoformat()
                })
                self._save_session(session)
            raise

    def pause_import(self, session_id: str) -> bool:
        """
        暂停导入

        Args:
            session_id: 导入会话ID

        Returns:
            bool: 是否成功暂停
        """
        try:
            session = self._load_session(session_id)
            if not session:
                self.logger.warning(f"导入会话不存在: {session_id}")
                return False

            if session["status"] != ImportStatus.RUNNING:
                self.logger.warning(f"导入会话未在运行: {session_id}")
                return False

            session["status"] = ImportStatus.PAUSED
            session["checkpoint"] = {
                "processed": session["progress"]["processed"],
                "timestamp": datetime.now().isoformat()
            }
            self._save_session(session)

            self.logger.info(f"已暂停导入会话: {session_id}")
            return True

        except Exception as e:
            self.logger.error(f"暂停导入失败: {e}")
            return False

    def resume_import(self, session_id: str) -> Dict[str, Any]:
        """
        恢复导入

        Args:
            session_id: 导入会话ID

        Returns:
            Dict: 导入结果统计
        """
        try:
            session = self._load_session(session_id)
            if not session:
                raise ValueError(f"导入会话不存在: {session_id}")

            if session["status"] != ImportStatus.PAUSED:
                raise ValueError(f"导入会话未暂停: {session_id}")

            session["status"] = ImportStatus.RUNNING
            self._save_session(session)

            self.logger.info(f"恢复导入会话: {session_id}")

            # 从断点继续导入
            if session["source_type"] == ImportSource.EXTERNAL_API:
                return self._import_from_external_api(session, resume=True)
            elif session["source_type"] == ImportSource.JSON_FILE:
                return self._import_from_json_file(session, resume=True)
            elif session["source_type"] == ImportSource.CSV_FILE:
                return self._import_from_csv_file(session, resume=True)
            elif session["source_type"] == ImportSource.MANUAL:
                # 手动导入不支持断点恢复，重新开始
                self.logger.warning(f"手动导入不支持断点恢复，将重新开始: {session_id}")
                return self._import_from_manual(session)
            else:
                raise ValueError(f"不支持的数据源类型: {session['source_type']}")

        except Exception as e:
            self.logger.error(f"恢复导入失败: {e}")
            raise

    def cancel_import(self, session_id: str) -> bool:
        """
        取消导入

        Args:
            session_id: 导入会话ID

        Returns:
            bool: 是否成功取消
        """
        try:
            session = self._load_session(session_id)
            if not session:
                self.logger.warning(f"导入会话不存在: {session_id}")
                return False

            session["status"] = ImportStatus.CANCELLED
            session["completed_at"] = datetime.now().isoformat()
            self._save_session(session)

            self.logger.info(f"已取消导入会话: {session_id}")
            return True

        except Exception as e:
            self.logger.error(f"取消导入失败: {e}")
            return False

    def get_import_session(self, session_id: str) -> Optional[Dict[str, Any]]:
        """
        获取导入会话信息

        Args:
            session_id: 导入会话ID

        Returns:
            Optional[Dict]: 会话信息，不存在返回None
        """
        return self._load_session(session_id)

    def list_import_sessions(self, status: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        列出导入会话

        Args:
            status: 状态过滤

        Returns:
            List[Dict]: 会话列表
        """
        sessions_dir = os.path.dirname(self.checkpoint_file)
        sessions = []

        if os.path.exists(sessions_dir):
            for filename in os.listdir(sessions_dir):
                if filename.startswith("import_session_") and filename.endswith(".json"):
                    filepath = os.path.join(sessions_dir, filename)
                    try:
                        with open(filepath, 'r', encoding='utf-8') as f:
                            session = json.load(f)
                            if status is None or session.get("status") == status:
                                sessions.append(session)
                    except Exception as e:
                        self.logger.warning(f"读取会话文件失败: {filename}, {e}")

        # 按创建时间倒序排列
        sessions.sort(key=lambda x: x.get("created_at", ""), reverse=True)
        return sessions

    # -------------------------------------------------------------------------
    # 导入实现
    # -------------------------------------------------------------------------

    def _import_from_external_api(
        self,
        session: Dict[str, Any],
        resume: bool = False
    ) -> Dict[str, Any]:
        """
        从外部API导入

        Args:
            session: 会话信息
            resume: 是否从断点恢复

        Returns:
            Dict: 导入结果统计
        """
        import requests

        result = {
            "total": 0,
            "processed": 0,
            "successful": 0,
            "failed": 0,
            "skipped": 0,
            "duplicates": 0,
            "errors": []
        }

        source_config = session["source_config"]
        api_endpoint = source_config.get("endpoint")
        api_key = source_config.get("api_key") or os.getenv("EXTERNAL_API_KEY")

        if not api_endpoint:
            raise ValueError("外部API端点未配置")

        # 计算起始位置
        start_offset = 0
        if resume and session.get("checkpoint"):
            start_offset = session["checkpoint"]["processed"]

        page = start_offset // self.batch_size
        page_size = self.batch_size

        while True:
            # 检查是否暂停或取消
            current_session = self._load_session(session["session_id"])
            if current_session["status"] == ImportStatus.PAUSED:
                self.logger.info("导入已暂停")
                break
            if current_session["status"] == ImportStatus.CANCELLED:
                self.logger.info("导入已取消")
                break

            try:
                # 调用API获取数据
                headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
                params = {
                    "page": page,
                    "page_size": page_size
                }

                response = requests.get(
                    api_endpoint,
                    headers=headers,
                    params=params,
                    timeout=self.import_config.get("external_api", {}).get("timeout", 60)
                )
                response.raise_for_status()

                data = response.json()
                questions = data.get("questions", [])

                if not questions:
                    self.logger.info("没有更多数据，导入完成")
                    break

                result["total"] += len(questions)

                # 处理数据
                batch_result = self._process_import_batch(session, questions)
                result["processed"] += batch_result["processed"]
                result["successful"] += batch_result["successful"]
                result["failed"] += batch_result["failed"]
                result["skipped"] += batch_result["skipped"]
                result["duplicates"] += batch_result["duplicates"]
                result["errors"].extend(batch_result["errors"])

                # 保存断点
                if self.checkpoint_enabled:
                    self._save_checkpoint(session["session_id"], {
                        "page": page,
                        "processed": result["processed"]
                    })

                page += 1

                # 限制处理数量（防止无限循环）
                if result["processed"] >= source_config.get("max_questions", float('inf')):
                    self.logger.info("达到最大处理数量，导入完成")
                    break

            except requests.RequestException as e:
                self.logger.error(f"API请求失败: {e}")
                result["errors"].append({
                    "type": "api_error",
                    "message": str(e),
                    "page": page
                })
                # 继续下一页
                page += 1
                continue

        return result

    def _import_from_json_file(
        self,
        session: Dict[str, Any],
        resume: bool = False
    ) -> Dict[str, Any]:
        """
        从JSON文件导入

        Args:
            session: 会话信息
            resume: 是否从断点恢复

        Returns:
            Dict: 导入结果统计
        """
        source_config = session["source_config"]
        file_path = source_config.get("file_path")

        if not file_path or not os.path.exists(file_path):
            raise ValueError(f"JSON文件不存在: {file_path}")

        result = {
            "total": 0,
            "processed": 0,
            "successful": 0,
            "failed": 0,
            "skipped": 0,
            "duplicates": 0,
            "errors": []
        }

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            questions = data.get("questions", [])
            result["total"] = len(questions)

            # 计算起始位置
            start_offset = 0
            if resume and session.get("checkpoint"):
                start_offset = session["checkpoint"]["processed"]

            # 分批处理
            for i in range(start_offset, len(questions), self.batch_size):
                batch = questions[i:i + self.batch_size]

                # 检查是否暂停或取消
                current_session = self._load_session(session["session_id"])
                if current_session["status"] == ImportStatus.PAUSED:
                    break
                if current_session["status"] == ImportStatus.CANCELLED:
                    break

                batch_result = self._process_import_batch(session, batch)
                result["processed"] += batch_result["processed"]
                result["successful"] += batch_result["successful"]
                result["failed"] += batch_result["failed"]
                result["skipped"] += batch_result["skipped"]
                result["duplicates"] += batch_result["duplicates"]
                result["errors"].extend(batch_result["errors"])

                # 保存断点
                if self.checkpoint_enabled:
                    self._save_checkpoint(session["session_id"], {
                        "processed": result["processed"]
                    })

        except json.JSONDecodeError as e:
            raise ValueError(f"JSON文件格式错误: {e}")
        except Exception as e:
            raise

        return result

    def _import_from_csv_file(
        self,
        session: Dict[str, Any],
        resume: bool = False
    ) -> Dict[str, Any]:
        """
        从CSV文件导入

        Args:
            session: 会话信息
            resume: 是否从断点恢复

        Returns:
            Dict: 导入结果统计
        """
        import csv

        source_config = session["source_config"]
        file_path = source_config.get("file_path")

        if not file_path or not os.path.exists(file_path):
            raise ValueError(f"CSV文件不存在: {file_path}")

        result = {
            "total": 0,
            "processed": 0,
            "successful": 0,
            "failed": 0,
            "skipped": 0,
            "duplicates": 0,
            "errors": []
        }

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                questions = list(reader)

            result["total"] = len(questions)

            # 计算起始位置
            start_offset = 0
            if resume and session.get("checkpoint"):
                start_offset = session["checkpoint"]["processed"]

            # 分批处理
            for i in range(start_offset, len(questions), self.batch_size):
                batch = questions[i:i + self.batch_size]

                # 检查是否暂停或取消
                current_session = self._load_session(session["session_id"])
                if current_session["status"] == ImportStatus.PAUSED:
                    break
                if current_session["status"] == ImportStatus.CANCELLED:
                    break

                batch_result = self._process_import_batch(session, batch)
                result["processed"] += batch_result["processed"]
                result["successful"] += batch_result["successful"]
                result["failed"] += batch_result["failed"]
                result["skipped"] += batch_result["skipped"]
                result["duplicates"] += batch_result["duplicates"]
                result["errors"].extend(batch_result["errors"])

                # 保存断点
                if self.checkpoint_enabled:
                    self._save_checkpoint(session["session_id"], {
                        "processed": result["processed"]
                    })

        except Exception as e:
            raise

        return result

    def _import_from_manual(self, session: Dict[str, Any]) -> Dict[str, Any]:
        """
        手动导入（直接传入数据）

        Args:
            session: 会话信息

        Returns:
            Dict: 导入结果统计
        """
        source_config = session["source_config"]
        questions = source_config.get("questions", [])

        if not questions:
            return {
                "total": 0,
                "processed": 0,
                "successful": 0,
                "failed": 0,
                "skipped": 0,
                "duplicates": 0,
                "errors": []
            }

        return self._process_import_batch(session, questions)

    def _process_import_batch(
        self,
        session: Dict[str, Any],
        raw_questions: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        处理一批导入数据

        Args:
            session: 会话信息
            raw_questions: 原始题目数据

        Returns:
            Dict: 处理结果统计
        """
        result = {
            "total": len(raw_questions),
            "processed": len(raw_questions),
            "successful": 0,
            "failed": 0,
            "skipped": 0,
            "duplicates": 0,
            "errors": []
        }

        # 数据验证和清洗
        validated_questions = []
        for idx, raw_data in enumerate(raw_questions):
            try:
                validated_data = self._validate_and_clean_data(raw_data, session["options"])
                validated_questions.append((idx, validated_data))
            except Exception as e:
                result["failed"] += 1
                result["errors"].append({
                    "index": idx,
                    "error": str(e),
                    "data": raw_data
                })
                self.logger.warning(f"数据验证失败 [{idx}]: {e}")

        # 重复检测
        if self.duplicate_enabled and session["options"].get("skip_duplicates", True):
            non_duplicate_questions = []
            for idx, question_data in validated_questions:
                is_duplicate, duplicate_info = self._check_duplicates(question_data)
                if is_duplicate:
                    result["duplicates"] += 1
                    result["skipped"] += 1
                    self.logger.info(f"检测到重复题目，已跳过: {duplicate_info}")
                else:
                    non_duplicate_questions.append((idx, question_data))
            validated_questions = non_duplicate_questions

        # 如果只是验证模式，不实际导入
        if session["options"].get("validate_only", False):
            result["successful"] = len(validated_questions)
            return result

        # 并发导入
        if validated_questions:
            question_dtos = [data for _, data in validated_questions]

            with ThreadPoolExecutor(max_workers=4) as executor:
                future_to_idx = {
                    executor.submit(
                        self._import_single_question_safe,
                        dto
                    ): idx for idx, (_, dto) in enumerate(validated_questions)
                }

                for future in as_completed(future_to_idx):
                    try:
                        question_id = future.result()
                        if question_id:
                            result["successful"] += 1
                    except Exception as e:
                        idx = future_to_idx[future]
                        result["failed"] += 1
                        result["errors"].append({
                            "index": idx,
                            "error": str(e)
                        })
                        self.logger.error(f"导入题目失败 [{idx}]: {e}")

        return result

    def _validate_and_clean_data(
        self,
        raw_data: Dict[str, Any],
        options: Dict[str, Any]
    ) -> QuestionCreateDTO:
        """
        验证和清洗数据

        Args:
            raw_data: 原始数据
            options: 导入选项

        Returns:
            QuestionCreateDTO: 清洗后的数据

        Raises:
            ValueError: 数据无效
        """
        # 提取基本字段
        content = raw_data.get("content") or raw_data.get("question") or raw_data.get("text")
        if not content or not content.strip():
            raise ValueError("题目内容不能为空")

        title = raw_data.get("title") or content[:50] + "..."

        # 处理标签
        tags = raw_data.get("tags", [])
        if isinstance(tags, str):
            tags = [t.strip() for t in tags.split(",") if t.strip()]
        if not isinstance(tags, list):
            tags = []

        # 构建DTO数据，使用转换器处理格式
        dto_data = {
            "content": content,
            "title": title,
            "question_type": DataFormatConverter.convert_question_type(raw_data.get("question_type")),
            "category": raw_data.get("category") or "未分类",
            "difficulty": DataFormatConverter.convert_difficulty(raw_data.get("difficulty")),
            "tags": tags,
            "status": DataFormatConverter.convert_status(raw_data.get("status"))
        }

        # 根据选项决定是否导入答案和解析
        if options.get("import_answers", True):
            dto_data["answer"] = raw_data.get("answer", "")
        if options.get("import_explanations", True):
            dto_data["explanation"] = raw_data.get("explanation", "")

        return QuestionCreateDTO(**dto_data)

    def _check_duplicates(
        self,
        question_data: QuestionCreateDTO
    ) -> Tuple[bool, Optional[Dict[str, Any]]]:
        """
        检查重复题目

        Args:
            question_data: 题目数据

        Returns:
            Tuple[bool, Optional[Dict]]: (是否重复, 重复信息)
        """
        try:
            # 使用语义检索查找相似题目
            search_result = self.search_service.search_by_semantic(
                query=question_data.content,
                top_k=5,
                min_similarity=self.similarity_threshold,
                include_metadata=True
            )

            # search_by_semantic 当前返回 dict，包含 results 列表
            # 为兼容性考虑，这里同时处理 dict / list 两种形式
            results: List[Dict[str, Any]] = []
            if isinstance(search_result, dict):
                results = search_result.get("results", []) or []
            elif isinstance(search_result, (list, tuple)):
                # 旧形式：直接返回结果列表
                results = list(search_result)

            if results:
                # 计算内容相似度
                best_match = results[0]
                # 优先使用相似度分数字段，其次退回搜索分数字段
                similarity = best_match.get("similarity_score")
                if similarity is None:
                    similarity = best_match.get("search_score", 0.0)

                if similarity is not None and similarity >= self.similarity_threshold:
                    return True, {
                        "matched_question_id": best_match.get("question_id"),
                        "similarity": similarity,
                        "title": best_match.get("title")
                    }

            return False, None

        except Exception as e:
            self.logger.warning(f"重复检测失败: {e}")
            # 重复检测失败时，默认不跳过
            return False, None

    def _import_single_question_safe(self, question_data: QuestionCreateDTO) -> Optional[str]:
        """
        安全地导入单个题目（用于并发）

        Args:
            question_data: 题目数据

        Returns:
            Optional[str]: 题目ID，失败返回None
        """
        try:
            return self.management_service.create_question(question_data)
        except Exception as e:
            self.logger.error(f"导入题目失败: {e}")
            raise

    # -------------------------------------------------------------------------
    # 断点续传
    # -------------------------------------------------------------------------

    def _save_checkpoint(self, session_id: str, checkpoint_data: Dict[str, Any]):
        """保存断点信息"""
        if not self.checkpoint_enabled:
            return

        try:
            checkpoints_file = self.checkpoint_file
            checkpoints = {}

            if os.path.exists(checkpoints_file):
                with open(checkpoints_file, 'r', encoding='utf-8') as f:
                    checkpoints = json.load(f)

            checkpoints[session_id] = {
                **checkpoint_data,
                "timestamp": datetime.now().isoformat()
            }

            with open(checkpoints_file, 'w', encoding='utf-8') as f:
                json.dump(checkpoints, f, ensure_ascii=False, indent=2)

        except Exception as e:
            self.logger.warning(f"保存断点失败: {e}")

    def clear_checkpoint(self, session_id: str) -> bool:
        """
        Remove the checkpoint of one session from the shared checkpoint file.

        The file is rewritten only when the session actually has an entry.
        The rewrite is atomic (temp file + ``os.replace``), so a failure while
        writing cannot truncate the file and wipe other sessions' checkpoints.

        Args:
            session_id: Session whose checkpoint should be removed.

        Returns:
            bool: True when the checkpoint is gone afterwards (including the
                cases where the file or the entry did not exist); False when
                reading, parsing or writing the file failed (the error is logged).
        """
        try:
            # Open directly instead of exists()+open(): no window in which the
            # file can vanish between the check and the read.
            try:
                with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
                    checkpoints = json.load(f)
            except FileNotFoundError:
                return True

            if session_id in checkpoints:
                del checkpoints[session_id]

                payload = json.dumps(checkpoints, ensure_ascii=False, indent=2)
                tmp_file = f"{self.checkpoint_file}.tmp"
                try:
                    with open(tmp_file, 'w', encoding='utf-8') as f:
                        f.write(payload)
                    os.replace(tmp_file, self.checkpoint_file)
                except Exception:
                    # Do not leave a stale temp file behind on failure.
                    try:
                        if os.path.exists(tmp_file):
                            os.remove(tmp_file)
                    except OSError:
                        pass
                    raise

            return True

        except Exception as e:
            self.logger.error(f"清除断点失败: {e}")
            return False

    # -------------------------------------------------------------------------
    # 会话管理
    # -------------------------------------------------------------------------

    def _get_session_file(self, session_id: str) -> str:
        """获取会话文件路径"""
        sessions_dir = os.path.dirname(self.checkpoint_file)
        return os.path.join(sessions_dir, f"import_session_{session_id}.json")

    def _save_session(self, session: Dict[str, Any]):
        """保存会话信息"""
        try:
            session_file = self._get_session_file(session["session_id"])
            with open(session_file, 'w', encoding='utf-8') as f:
                json.dump(session, f, ensure_ascii=False, indent=2)
        except Exception as e:
            self.logger.error(f"保存会话失败: {e}")
            raise

    def _load_session(self, session_id: str) -> Optional[Dict[str, Any]]:
        """加载会话信息"""
        try:
            session_file = self._get_session_file(session_id)
            if not os.path.exists(session_file):
                return None

            with open(session_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except Exception as e:
            self.logger.error(f"加载会话失败: {e}")
            return None

    # -------------------------------------------------------------------------
    # 导入报告
    # -------------------------------------------------------------------------

    def generate_import_report(self, session_id: str) -> Dict[str, Any]:
        """
        Build a summary report for one import session.

        Args:
            session_id: ID of the session to report on.

        Returns:
            Dict: Session metadata, timing (duration in seconds and processed
                items per second, each None when it cannot be computed), the
                progress statistics, the options, the error count and whether
                a checkpoint is present. When the session has errors, the
                first 10 are included under ``"errors"``.

        Raises:
            ValueError: If the session cannot be loaded.
        """
        session = self._load_session(session_id)
        if not session:
            raise ValueError(f"导入会话不存在: {session_id}")

        def _parse_time(key):
            # Empty/None timestamps mean "not reached yet".
            raw = session[key]
            return datetime.fromisoformat(raw) if raw else None

        # Elapsed time is only known once both ends are recorded.
        begin = _parse_time("started_at")
        end = _parse_time("completed_at")
        elapsed = (end - begin).total_seconds() if begin and end else None

        # Throughput is undefined for a missing or zero-length duration.
        if elapsed and elapsed > 0:
            rate = session["progress"]["processed"] / elapsed
        else:
            rate = None

        report = {
            "session_id": session_id,
            "source_type": session["source_type"],
            "status": session["status"],
            "created_at": session["created_at"],
            "started_at": session["started_at"],
            "completed_at": session["completed_at"],
            "duration_seconds": elapsed,
            "processing_speed_questions_per_second": rate,
            "statistics": session["progress"],
            "options": session["options"],
            "errors_count": len(session["errors"]),
            "has_checkpoint": session.get("checkpoint") is not None
        }

        # Attach error details, capped at the first 10 entries.
        error_list = session["errors"]
        if error_list:
            report["errors"] = error_list[:10]

        return report

    def export_import_statistics(
        self,
        sessions: Optional[List[str]] = None,
        format: str = "json"
    ) -> Dict[str, Any]:
        """
        Aggregate statistics across import sessions.

        Args:
            sessions: Session IDs to include; IDs that cannot be loaded are
                dropped. None or an empty list means every session returned
                by ``list_import_sessions``.
            format: Export format. Currently not used by this method.

        Returns:
            Dict: ``generated_at`` timestamp, overall totals
                (``total_statistics``), per-source totals (``by_source``) and
                the list of included session dicts (``sessions``).

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            if sessions:
                loaded = (self._load_session(sid) for sid in sessions)
                import_sessions = [item for item in loaded if item]
            else:
                import_sessions = self.list_import_sessions()

            # Overall totals: (report key, progress key) pairs to accumulate.
            summed_fields = (
                ("total_questions", "total"),
                ("total_successful", "successful"),
                ("total_failed", "failed"),
                ("total_skipped", "skipped"),
                ("total_duplicates", "duplicates"),
            )
            status_counts = {}
            totals = {"total_sessions": len(import_sessions)}
            for report_key, _ in summed_fields:
                totals[report_key] = 0
            totals["sessions_by_status"] = status_counts

            for entry in import_sessions:
                progress = entry.get("progress", {})
                for report_key, progress_key in summed_fields:
                    totals[report_key] += progress.get(progress_key, 0)

                state = entry.get("status", "unknown")
                status_counts[state] = status_counts.get(state, 0) + 1

            # Per-source totals, computed in a second pass over the sessions.
            per_source = {}
            for entry in import_sessions:
                origin = entry.get("source_type", "unknown")
                bucket = per_source.setdefault(
                    origin,
                    {"count": 0, "total_questions": 0, "total_successful": 0},
                )

                progress = entry.get("progress", {})
                bucket["count"] += 1
                bucket["total_questions"] += progress.get("total", 0)
                bucket["total_successful"] += progress.get("successful", 0)

            return {
                "generated_at": datetime.now().isoformat(),
                "total_statistics": totals,
                "by_source": per_source,
                "sessions": import_sessions
            }

        except Exception as exc:
            self.logger.error(f"导出统计失败: {exc}")
            raise
