#!/usr/bin/env python3
"""
Create a JSON index from top30_prodver.md
Extracts cloud provider information and creates a structured JSON file
"""

import json
import os
import re
from datetime import datetime, timezone
from typing import List, Dict, Any

# Absolute paths (inside a Termux home directory) of the markdown source that
# is parsed and of the JSON index file that is written from it.
INPUT_FILE = "/data/data/com.termux/files/home/grok/cloud_host/top30_prodver.md"
OUTPUT_FILE = "/data/data/com.termux/files/home/grok/cloud_host/cloud_providers_index.json"

# One numbered provider entry:
#   N. **Name** - [link text](URL) - optional description
# Only MULTILINE is set, so ^/$ anchor at line boundaries.  DOTALL is
# deliberately NOT used: with it, the lazy name/link/URL groups could run
# across newlines and glue a malformed entry onto the next valid one.
_PROVIDER_ITEM_RE = re.compile(
    r'^\d+\.\s+\*\*(.+?)\*\*\s*-\s*\[(.+?)\]\((.+?)\)(.*)$',
    re.MULTILINE,
)


def parse_markdown(file_path: str) -> List[Dict[str, Any]]:
    """
    Parse the markdown file and extract provider information.

    Args:
        file_path: Path of the UTF-8 encoded markdown file to read.

    Returns:
        One dict per matched list entry, in file order, with the keys
        ``id`` (1-based position among the matched entries, not the number
        written in the markdown), ``name``, ``link_text``, ``url`` and
        ``description`` (the remainder of the entry's line with a leading
        "-" separator removed; "" when there is none).
        If the file cannot be read or decoded, the error is printed and an
        empty list is returned instead of raising.
    """
    # Keep the try body to the I/O only, so that nothing but a read/decode
    # failure is reported as "Error reading file".
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file: {e}")
        return []

    providers = []
    for position, match in enumerate(_PROVIDER_ITEM_RE.finditer(content), start=1):
        name, link_text, url, description = (
            group.strip() for group in match.groups()
        )

        # The description follows the link after a " - " separator; drop it.
        if description.startswith('-'):
            description = description[1:].strip()

        providers.append({
            "id": position,
            "name": name,
            "link_text": link_text,
            "url": url,
            "description": description,
        })

    return providers

def categorize_providers(providers: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Group providers into tiers according to their "id" value.

    Ids up to 10 go to "top_tier", ids up to 50 go to "middle_tier" and
    everything else goes to "emerging_tier".  The returned lists hold the
    same dict objects that were passed in, in their original order.
    """
    # Upper id bound (inclusive) of each bounded tier, checked in order.
    tier_bounds = ((10, "top_tier"), (50, "middle_tier"))

    buckets = {
        tier: [] for tier in ("top_tier", "middle_tier", "emerging_tier")
    }

    for entry in providers:
        rank = entry["id"]
        tier = next(
            (label for limit, label in tier_bounds if rank <= limit),
            "emerging_tier",
        )
        buckets[tier].append(entry)

    return buckets

def create_index(providers: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Create a structured JSON index.

    Args:
        providers: Provider dicts; each must have at least the keys
            "id" and "name".

    Returns:
        A dict with the sections "metadata" (title, source, creation time,
        per-tier counts), "providers_by_tier", "providers_by_id" (keyed by
        the id as a string), "providers_by_name" and "all_providers".
        All sections reference the same provider dict objects.  When two
        providers share a name, "providers_by_name" keeps only the later one.
    """
    categorized = categorize_providers(providers)

    # datetime.utcnow() is deprecated since Python 3.12.  Take an aware UTC
    # time and drop the tzinfo so the serialized form stays
    # "<naive ISO-8601>Z" rather than "...+00:00Z".
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    index = {
        "metadata": {
            "title": "Cloud Infrastructure Service Providers Index",
            "description": "Comprehensive index of cloud service providers extracted from top30_prodver.md",
            "source_file": "top30_prodver.md",
            "created_at": created_at,
            "total_providers": len(providers),
            "categories": {
                "top_tier": {
                    "description": "Major global cloud service providers (Top 10)",
                    "count": len(categorized["top_tier"])
                },
                "middle_tier": {
                    "description": "Medium-sized and specialized providers (11-50)",
                    "count": len(categorized["middle_tier"])
                },
                "emerging_tier": {
                    "description": "Emerging, regional, or specialized providers (51+)",
                    "count": len(categorized["emerging_tier"])
                }
            }
        },
        "providers_by_tier": {
            "top_tier": categorized["top_tier"],
            "middle_tier": categorized["middle_tier"],
            "emerging_tier": categorized["emerging_tier"]
        },
        # JSON object keys must be strings, hence str(id).
        "providers_by_id": {str(p["id"]): p for p in providers},
        "providers_by_name": {p["name"]: p for p in providers},
        "all_providers": providers
    }

    return index

def main():
    """
    Parse INPUT_FILE, build the provider index and write it to OUTPUT_FILE.

    Progress and a summary are printed to stdout.

    Returns:
        0 on success; 1 when no providers could be parsed (nothing is
        written in that case) or when the JSON file cannot be written.
    """
    print("Creating Cloud Provider Index from top30_prodver.md")
    print(f"Input file: {INPUT_FILE}")
    print(f"Output file: {OUTPUT_FILE}")
    print("")

    # Parse markdown
    print("Parsing markdown file...", end=" ", flush=True)
    providers = parse_markdown(INPUT_FILE)
    if not providers:
        # An unreadable or non-matching input yields an empty list.  Stop
        # here rather than overwrite the output with an empty index and
        # then fail on providers[0] in the sample section below.
        print("✗ No providers found; nothing written")
        return 1
    print(f"✓ Found {len(providers)} providers")

    # Create index
    print("Creating index structure...", end=" ", flush=True)
    index = create_index(providers)
    print("✓ Done")

    # Save to JSON
    print("Saving JSON file...", end=" ", flush=True)
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII provider names readable.
            json.dump(index, f, indent=2, ensure_ascii=False)
    except OSError as e:
        print(f"✗ Error: {e}")
        return 1
    print("✓ Done")

    # Display results
    print("")
    print("=" * 70)
    print("INDEX CREATED SUCCESSFULLY")
    print("=" * 70)
    print("")
    print(f"File location: {OUTPUT_FILE}")

    file_size = os.path.getsize(OUTPUT_FILE)
    print(f"File size: {file_size / 1024:.2f} KB")
    print("")

    tiers = index['providers_by_tier']
    print("Summary:")
    print(f"  Total Providers: {len(providers)}")
    print(f"  Top Tier (1-10): {len(tiers['top_tier'])}")
    print(f"  Middle Tier (11-50): {len(tiers['middle_tier'])}")
    print(f"  Emerging Tier (51+): {len(tiers['emerging_tier'])}")
    print("")

    # Show sample (providers is known to be non-empty at this point)
    print("Sample Provider Entry:")
    print(json.dumps(providers[0], indent=2))
    print("")

    # Show categories summary: first three names per non-empty tier
    print("Categories Summary:")
    for tier, entries in tiers.items():
        if entries:
            names = [p['name'] for p in entries[:3]]
            print(f"  {tier}: {', '.join(names)}{'...' if len(entries) > 3 else ''}")

    print("")
    print("✓ Task completed successfully!")

    return 0

if __name__ == "__main__":
    # The bare `exit` name is installed by the `site` module for interactive
    # sessions and is absent under e.g. `python -S`; raising SystemExit sets
    # the same process exit status without relying on it.
    raise SystemExit(main())
