#!/usr/bin/env python3
"""
Wayback Machine CLI Tool - 互联网档案时光机命令行工具
一个类似Internet Archive Wayback Machine的本地命令行工具
"""

import os
import sys
import json
import sqlite3
import hashlib
import argparse
import requests
from datetime import datetime
from urllib.parse import urlparse, urljoin
from pathlib import Path
import gzip
from bs4 import BeautifulSoup


class WaybackCLI:
    def __init__(self, data_dir="~/.wayback"):
        self.data_dir = Path(data_dir).expanduser()
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.db_path = self.data_dir / "archives.db"
        self.content_dir = self.data_dir / "content"
        self.content_dir.mkdir(exist_ok=True)
        self.init_database()

    def init_database(self):
        """初始化数据库"""
        conn = sqlite3.connect(self.db_path)
        conn.execute('''
            CREATE TABLE IF NOT EXISTS archives (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT NOT NULL,
                title TEXT,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                content_hash TEXT UNIQUE,
                file_path TEXT,
                status_code INTEGER,
                content_type TEXT,
                size INTEGER
            )
        ''')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_url ON archives(url)')
        conn.execute('CREATE INDEX IF NOT EXISTS idx_timestamp ON archives(timestamp)')
        conn.commit()
        conn.close()

    def archive_url(self, url, follow_links=False, max_depth=1):
        """存档一个URL"""
        print(f"📥 正在存档: {url}")

        try:
            headers = {
                'User-Agent': 'Wayback-CLI/1.0 (Personal Archive Tool)'
            }
            response = requests.get(url, headers=headers, timeout=30)

            # Hash the response body so identical content is stored only once
            content_hash = hashlib.sha256(response.content).hexdigest()

            # Skip saving if this exact content has already been archived
            conn = sqlite3.connect(self.db_path)
            existing = conn.execute(
                'SELECT id FROM archives WHERE content_hash = ?',
                (content_hash,)
            ).fetchone()

            if existing:
                print(f"✓ 内容已存在 (ID: {existing[0]})")
                conn.close()
                return existing[0]

            # Write the compressed response body to disk
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_url = urlparse(url).netloc.replace('.', '_')
            filename = f"{safe_url}_{timestamp}_{content_hash[:8]}.gz"
            file_path = self.content_dir / filename

            with gzip.open(file_path, 'wb') as f:
                f.write(response.content)

            # Extract the page title from HTML responses
            title = ""
            if 'text/html' in response.headers.get('content-type', ''):
                try:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    title_tag = soup.find('title')
                    if title_tag:
                        title = title_tag.get_text().strip()
                except Exception:
                    pass

            # Record the archive metadata in the database
            archive_id = conn.execute('''
                INSERT INTO archives
                (url, title, content_hash, file_path, status_code, content_type, size)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (
                url, title, content_hash, str(file_path),
                response.status_code,
                response.headers.get('content-type', ''),
                len(response.content)
            )).lastrowid

            conn.commit()
            conn.close()

            print(f"✓ 存档完成 (ID: {archive_id}, 大小: {len(response.content)} bytes)")

            # Optionally follow links found on the archived HTML page;
            # _follow_links handles the depth countdown, so pass max_depth as-is
            if follow_links and max_depth > 0 and 'text/html' in response.headers.get('content-type', ''):
                self._follow_links(url, response.content, max_depth)

            return archive_id

        except Exception as e:
            print(f"❌ 存档失败: {e}")
            return None

    def _follow_links(self, base_url, content, max_depth):
        """跟随页面中的链接"""
        if max_depth <= 0:
            return

        try:
            soup = BeautifulSoup(content, 'html.parser')
            links = soup.find_all('a', href=True)
            base_domain = urlparse(base_url).netloc

            for link in links[:10]:  # cap at 10 links per page
                href = link['href']
                full_url = urljoin(base_url, href)

                # Only follow links on the same domain
                if urlparse(full_url).netloc == base_domain:
                    print(f"🔗 Following link: {full_url}")
                    self.archive_url(full_url, follow_links=max_depth > 1, max_depth=max_depth - 1)

        except Exception as e:
            print(f"⚠️ 跟随链接时出错: {e}")

    def search(self, query, limit=20):
        """搜索存档"""
        conn = sqlite3.connect(self.db_path)
        results = conn.execute('''
            SELECT id, url, title, timestamp, size
            FROM archives
            WHERE url LIKE ? OR title LIKE ?
            ORDER BY timestamp DESC
            LIMIT ?
        ''', (f'%{query}%', f'%{query}%', limit)).fetchall()
        conn.close()

        if not results:
            print(f"🔍 未找到包含 '{query}' 的存档")
            return

        print(f"🔍 找到 {len(results)} 个结果:")
        print("-" * 80)
        for archive_id, url, title, timestamp, size in results:
            title_display = title[:50] + "..." if title and len(title) > 50 else title or ""
            size_kb = size // 1024 if size else 0
            print(f"ID: {archive_id:4d} | {timestamp} | {size_kb:4d}KB")
            print(f"URL: {url}")
            if title_display:
                print(f"标题: {title_display}")
            print("-" * 80)

    def list_archives(self, limit=20):
        """列出最近的存档"""
        conn = sqlite3.connect(self.db_path)
        results = conn.execute('''
            SELECT id, url, title, timestamp, size
            FROM archives
            ORDER BY timestamp DESC
            LIMIT ?
        ''', (limit,)).fetchall()
        conn.close()

        if not results:
            print("📝 暂无存档")
            return

        print(f"📝 最近 {len(results)} 个存档:")
        print("-" * 80)
        for archive_id, url, title, timestamp, size in results:
            title_display = title[:50] + "..." if title and len(title) > 50 else title or ""
            size_kb = size // 1024 if size else 0
            print(f"ID: {archive_id:4d} | {timestamp} | {size_kb:4d}KB")
            print(f"URL: {url}")
            if title_display:
                print(f"标题: {title_display}")
            print("-" * 80)

    def view_archive(self, archive_id):
        """查看存档内容"""
        conn = sqlite3.connect(self.db_path)
        result = conn.execute(
            'SELECT url, title, timestamp, file_path, content_type FROM archives WHERE id = ?',
            (archive_id,)
        ).fetchone()
        conn.close()

        if not result:
            print(f"❌ 未找到ID为 {archive_id} 的存档")
            return

        url, title, timestamp, file_path, content_type = result

        print(f"📄 存档信息:")
        print(f"ID: {archive_id}")
        print(f"URL: {url}")
        print(f"标题: {title or '无标题'}")
        print(f"时间: {timestamp}")
        print(f"类型: {content_type}")
        print("-" * 80)

        try:
            with gzip.open(file_path, 'rb') as f:
                content = f.read()

            if 'text' in content_type or 'html' in content_type:
                try:
                    text_content = content.decode('utf-8', errors='ignore')
                    if 'html' in content_type:
                        soup = BeautifulSoup(text_content, 'html.parser')
                        text_content = soup.get_text()

                    # Show the first 1000 characters as a preview
                    preview = text_content[:1000]
                    if len(text_content) > 1000:
                        preview += "\n\n... (内容已截断，共 {} 字符)".format(len(text_content))

                    print(preview)
                except Exception:
                    print("📄 Binary content; cannot display as text")
            else:
                print(f"📄 二进制文件，大小: {len(content)} bytes")

        except Exception as e:
            print(f"❌ 读取文件失败: {e}")

    def stats(self):
        """显示统计信息"""
        conn = sqlite3.connect(self.db_path)

        total_count = conn.execute('SELECT COUNT(*) FROM archives').fetchone()[0]
        total_size = conn.execute('SELECT SUM(size) FROM archives').fetchone()[0] or 0

        # Per-domain counts: extract the host portion of each URL
        # (the text between '//' and the next '/', or to the end of the string)
        domain_stats = conn.execute('''
            SELECT
                CASE
                    WHEN url LIKE 'http%' THEN
                        SUBSTR(url, INSTR(url, '//') + 2,
                               CASE WHEN INSTR(SUBSTR(url, INSTR(url, '//') + 2), '/') > 0
                                    THEN INSTR(SUBSTR(url, INSTR(url, '//') + 2), '/') - 1
                                    ELSE LENGTH(SUBSTR(url, INSTR(url, '//') + 2))
                               END)
                    ELSE 'unknown'
                END as domain,
                COUNT(*) as count
            FROM archives
            GROUP BY domain
            ORDER BY count DESC
            LIMIT 10
        ''').fetchall()

        conn.close()

        print("📊 存档统计信息:")
        print(f"总存档数: {total_count}")
        print(f"总大小: {total_size // (1024*1024):.1f} MB")
        print(f"存储位置: {self.data_dir}")

        if domain_stats:
            print("\n🌐 按域名统计 (前10):")
            for domain, count in domain_stats:
                print(f"  {domain}: {count} 个存档")


def main():
    parser = argparse.ArgumentParser(description="Wayback Machine CLI - a local web archive time machine")
    parser.add_argument('--data-dir', default='~/.wayback', help='data storage directory')

    subparsers = parser.add_subparsers(dest='command', help='available commands')

    # archive command
    archive_parser = subparsers.add_parser('archive', help='archive a URL')
    archive_parser.add_argument('url', help='URL to archive')
    archive_parser.add_argument('--follow-links', action='store_true', help='follow links on the page')
    archive_parser.add_argument('--max-depth', type=int, default=1, help='maximum link-following depth')

    # search command
    search_parser = subparsers.add_parser('search', help='search archives')
    search_parser.add_argument('query', help='search keyword')
    search_parser.add_argument('--limit', type=int, default=20, help='maximum number of results')

    # list command
    list_parser = subparsers.add_parser('list', help='list archives')
    list_parser.add_argument('--limit', type=int, default=20, help='maximum number of entries to show')

    # view command
    view_parser = subparsers.add_parser('view', help='view an archive')
    view_parser.add_argument('id', type=int, help='archive ID')

    # stats command
    subparsers.add_parser('stats', help='show statistics')

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    wayback = WaybackCLI(args.data_dir)

    if args.command == 'archive':
        wayback.archive_url(args.url, args.follow_links, args.max_depth)
    elif args.command == 'search':
        wayback.search(args.query, args.limit)
    elif args.command == 'list':
        wayback.list_archives(args.limit)
    elif args.command == 'view':
        wayback.view_archive(args.id)
    elif args.command == 'stats':
        wayback.stats()


if __name__ == '__main__':
    main()
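
# Example command-line usage (a sketch; assumes the script is saved as wayback.py):
#
#   python3 wayback.py archive https://example.com --follow-links --max-depth 2
#   python3 wayback.py search example --limit 10
#   python3 wayback.py list
#   python3 wayback.py view 1
#   python3 wayback.py stats
#   python3 wayback.py --data-dir ~/my-archive list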