#!/usr/bin/env python3
"""
Script to search GitHub for cloud provider repositories and save results to JSON
"""

import json
import urllib.request
import urllib.error
import time
from typing import List, Dict, Any
import urllib.parse

# Cloud providers to search for with specific search terms.
# Maps a human-readable provider name to the GitHub search terms tried for
# it; main() uses at most the first two terms per provider.
PROVIDERS: Dict[str, List[str]] = {
    "AWS": ["aws", "amazon-web-services"],
    "Azure": ["azure", "microsoft-azure"],
    "Google Cloud": ["google-cloud", "gcp"],
    "Alibaba Cloud": ["alibaba-cloud", "aliyun"],
    "Huawei Cloud": ["huawei-cloud"],
    "Oracle Cloud": ["oracle-cloud", "oci"],
    "Tencent Cloud": ["tencent-cloud"],
    "IBM Cloud": ["ibm-cloud", "bluemix"],
    "DigitalOcean": ["digitalocean", "droplet"],
    "Linode": ["linode", "akamai"],
    "Vultr": ["vultr"],
    "OVHcloud": ["ovhcloud", "ovh"],
    "Rackspace": ["rackspace"],
    "Hetzner": ["hetzner"],
    "Scaleway": ["scaleway"],
    "Kubernetes": ["kubernetes", "k8s"],
    "Docker": ["docker", "container"],
    "Terraform": ["terraform", "infrastructure-as-code"],
    "Ansible": ["ansible", "automation"],
    "CloudFormation": ["cloudformation", "aws-cloudformation"],
}

# Destination for the JSON results (Termux home-directory path).
OUTPUT_FILE: str = "/data/data/com.termux/files/home/grok/cloud_host/cloud_providers_repos.json"
# GitHub REST API repository-search endpoint.
GITHUB_API_URL: str = "https://api.github.com/search/repositories"

def search_github_repos(search_term: str, limit: int = 10, retries: int = 3) -> List[Dict[str, Any]]:
    """
    Search the GitHub repository search API for popular repos matching a term.

    Restricts results to repositories with more than 100 stars, sorted by
    star count descending. Retries on rate limiting (HTTP 403) with a
    linearly increasing backoff of 60s per attempt.

    Args:
        search_term: Free-text query passed to the GitHub search API.
        limit: Maximum number of repositories to request (``per_page``).
        retries: Number of attempts before giving up.

    Returns:
        A list of dicts with selected repository metadata. Best-effort:
        returns whatever was collected (possibly empty) on any error.
    """
    repos: List[Dict[str, Any]] = []

    for attempt in range(retries):
        try:
            # Filter out low-signal repositories up front.
            query = f"{search_term} stars:>100"
            encoded_query = urllib.parse.quote(query)
            url = f"{GITHUB_API_URL}?q={encoded_query}&sort=stars&per_page={limit}"

            req = urllib.request.Request(
                url,
                headers={"Accept": "application/vnd.github.v3+json"}
            )

            with urllib.request.urlopen(req, timeout=10) as response:
                data = json.loads(response.read().decode('utf-8'))

            for item in data.get('items', []):
                # GitHub sends JSON null (not a missing key) for empty
                # fields, so a plain .get(key, default) would leak None;
                # `or`-coalesce to keep the declared default types.
                repos.append({
                    "name": item.get('name') or '',
                    "owner": (item.get('owner') or {}).get('login', ''),
                    "full_name": item.get('full_name') or '',
                    "description": item.get('description') or '',
                    "url": item.get('html_url') or '',
                    "stars": item.get('stargazers_count') or 0,
                    "forks": item.get('forks_count') or 0,
                    "language": item.get('language') or '',
                    "created_at": item.get('created_at') or '',
                    "updated_at": item.get('updated_at') or '',
                })

            return repos

        except urllib.error.HTTPError as e:
            # Only back off when another attempt remains; sleeping after
            # the final try would waste up to 60*retries seconds for nothing.
            if e.code == 403 and attempt < retries - 1:  # Rate limit
                wait_time = 60 * (attempt + 1)
                print(f" (rate limited, waiting {wait_time}s)", end="", flush=True)
                time.sleep(wait_time)
            else:
                # Non-retryable HTTP error or retries exhausted: best-effort return.
                return repos
        except Exception as e:
            print(f" (error: {str(e)[:20]})", end="", flush=True)
            if attempt < retries - 1:
                time.sleep(5)

    return repos

def _dedupe_by_full_name(repos: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop repos whose 'full_name' was already seen, preserving first-seen order."""
    seen = set()
    unique_repos = []
    for repo in repos:
        if repo['full_name'] not in seen:
            seen.add(repo['full_name'])
            unique_repos.append(repo)
    return unique_repos

def main():
    """
    Search GitHub for every configured provider and save the results as JSON.

    For each provider in PROVIDERS, queries up to two search terms,
    deduplicates the merged hits by full_name, then writes a summary plus
    the per-provider repository lists to OUTPUT_FILE.
    """
    import os

    print("GitHub Cloud Provider Repository Search")
    print(f"Output file: {OUTPUT_FILE}")
    print(f"Providers to search: {len(PROVIDERS)}")
    print("")

    all_repos = {}
    total_repos = 0
    provider_list = list(PROVIDERS.items())

    for i, (provider_name, search_terms) in enumerate(provider_list):
        print(f"[{i+1:2d}/{len(PROVIDERS)}] {provider_name:<20}", end=" ", flush=True)

        provider_repos = []

        # Search with each term for this provider (capped at 2 terms to
        # keep total API usage manageable).
        for search_term in search_terms[:2]:
            provider_repos.extend(search_github_repos(search_term, limit=5))
            # Small delay between searches to be gentle on the API.
            time.sleep(1)

        unique_repos = _dedupe_by_full_name(provider_repos)

        if unique_repos:
            all_repos[provider_name] = unique_repos
            total_repos += len(unique_repos)
            print(f"✓ Found {len(unique_repos)} repos")
        else:
            print("✗ No repos found")

        # Rate limiting - wait between provider searches (skip after last one).
        if i < len(provider_list) - 1:
            time.sleep(3)

    # Prepare output data: a summary header plus the full per-provider lists.
    output_data = {
        "search_summary": {
            "total_providers_searched": len(PROVIDERS),
            "total_repositories_found": total_repos,
            "providers_with_results": len(all_repos),
            "providers_list": list(all_repos.keys())
        },
        "repositories_by_provider": all_repos
    }

    # Save to JSON file; any filesystem failure is reported, not raised.
    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

        print("")
        print("✓ Search completed!")
        print(f"✓ Total repositories found: {total_repos}")
        print(f"✓ Results saved to: {OUTPUT_FILE}")

        # Show file info
        file_size = os.path.getsize(OUTPUT_FILE)
        print(f"✓ File size: {file_size / 1024:.2f} KB")
        print(f"✓ Providers with results: {len(all_repos)}/{len(PROVIDERS)}")

    except Exception as e:
        print(f"✗ Error saving results: {e}")
