#!/usr/bin/env python3
"""
S3 Object Size Statistics Script
Usage: ./stats.py <input_file> <output_file> <workers>
Reads URLs from input file and compares object sizes with their backups
"""

import boto3
import os
import sys
import argparse
from urllib.parse import urlparse
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from tqdm import tqdm


def parse_s3_url(url):
    """
    Parse an S3 HTTP(S) URL into its bucket name and object key.

    Supports both addressing styles:
      * virtual-hosted-style: https://bucket.s3.region.amazonaws.com/key
      * path-style:           https://s3.region.amazonaws.com/bucket/key

    The returned key is percent-decoded, because the S3 API expects the
    raw object key while URLs carry it percent-encoded (e.g. ``%20`` for
    a space).

    Args:
        url: The object URL to parse.

    Returns:
        A ``(bucket, key)`` tuple. ``key`` is an empty string when the URL
        has no object path.

    Raises:
        ValueError: If the host matches neither addressing style.
    """
    # Local import: the file-level import only brings in ``urlparse``.
    from urllib.parse import unquote

    parsed = urlparse(url)
    host = parsed.netloc
    raw_path = parsed.path.lstrip('/')

    # Bucket names may themselves contain ".s3" (e.g. "my.s3files"), so the
    # service boundary is the LAST ".s3." / ".s3-" marker in the host, not
    # the first occurrence of ".s3".
    boundary = max(host.rfind('.s3.'), host.rfind('.s3-'))

    if boundary != -1:
        # Virtual-hosted-style: everything before the boundary is the bucket
        bucket = host[:boundary]
        raw_key = raw_path
    elif host.startswith('s3'):
        # Path-style: first path segment is the bucket, the rest is the key.
        # Split before decoding so an encoded "/" in the key cannot be
        # mistaken for the bucket separator.
        bucket, _, raw_key = raw_path.partition('/')
    else:
        raise ValueError(f"Unable to parse S3 URL: {url}")

    return bucket, unquote(raw_key)


def get_object_size(s3_client, bucket, key):
    """
    Look up the size in bytes of ``key`` in ``bucket`` with a HEAD request.

    Returns the ``ContentLength`` reported by ``head_object``, or None when
    the client error code is '404'. Any other client error is re-raised
    unchanged.
    """
    try:
        head = s3_client.head_object(Bucket=bucket, Key=key)
    except s3_client.exceptions.ClientError as err:
        # Only a 404 means "object missing"; everything else is a real error
        if err.response['Error']['Code'] != '404':
            raise
        return None
    return head['ContentLength']


def format_size(size_bytes):
    """
    Render a byte count as a short human-readable string.

    None is rendered as "Not found". Otherwise the value is divided by 1024
    until it drops below 1024 and is printed with two decimals and the
    matching unit (B, KB, MB, GB, TB); anything larger is reported in PB.
    """
    if size_bytes is None:
        return "Not found"

    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = size_bytes
    step = 0
    while step < len(units):
        if value < 1024.0:
            return f"{value:.2f}{units[step]}"
        value /= 1024.0
        step += 1

    # Ran past TB: whatever is left is expressed in petabytes
    return f"{value:.2f}PB"


def process_url(url, s3_client):
    """
    Compare one object's size against its ".bak" sibling.

    Always returns a dict carrying 'url', 'backup_size', 'current_size'
    and 'status':
      * 'processed' - both objects exist and the backup is larger
      * 'skipped'   - with 'reason' set to 'missing_files' or 'no_reduction'
      * 'error'     - any exception was raised; 'error' holds its message
                      and both sizes are None
    """
    try:
        bucket, key = parse_s3_url(url)

        # Live object first, then the backup stored under "<key>.bak"
        current_size = get_object_size(s3_client, bucket, key)
        backup_size = get_object_size(s3_client, bucket, f"{key}.bak")

        outcome = {
            'url': url,
            'backup_size': backup_size,
            'current_size': current_size,
        }

        # Decide why (if at all) this URL is left out of the statistics
        if current_size is None or backup_size is None:
            skip_reason = 'missing_files'
        elif backup_size <= current_size:
            skip_reason = 'no_reduction'
        else:
            skip_reason = None

        if skip_reason is None:
            outcome['status'] = 'processed'
        else:
            outcome['status'] = 'skipped'
            outcome['reason'] = skip_reason

        return outcome

    except Exception as exc:
        # Failures are reported as data so one bad URL cannot stop the batch
        return {
            'url': url,
            'backup_size': None,
            'current_size': None,
            'status': 'error',
            'error': str(exc)
        }



def main():
    """
    Command-line entry point.

    Reads S3 URLs (one per line) from the input file, compares each object's
    size with its ".bak" backup using a pool of worker threads, and writes a
    per-URL report plus totals to the output file.

    Exits with status 1 when credentials are missing from the environment,
    the input file does not exist, the worker count is below 1, or the input
    file contains no URLs.

    URLs that fail with an error are counted and reported separately from
    URLs that are skipped (missing files or no size reduction).
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(
        description='Analyze S3 object sizes and compare with backups',
        usage='./stats.py <input_file> <output_file> <workers>'
    )
    parser.add_argument('input_file', help='Input file containing S3 URLs (one per line)')
    parser.add_argument('output_file', help='Output file for size statistics log')
    parser.add_argument('workers', type=int, help='Number of parallel workers')

    args = parser.parse_args()

    # Load environment variables
    load_dotenv()

    # Check for required credentials
    if not os.getenv('AWS_ACCESS_KEY_ID') or not os.getenv('AWS_SECRET_ACCESS_KEY'):
        print("Error: AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must be set in .env file")
        sys.exit(1)

    # Check if input file exists
    if not os.path.exists(args.input_file):
        print(f"Error: {args.input_file} not found")
        sys.exit(1)

    # Validate workers argument
    if args.workers < 1:
        print("Error: Number of workers must be at least 1")
        sys.exit(1)

    # Read URLs from file, ignoring blank lines
    with open(args.input_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    if not urls:
        print(f"Error: No URLs found in {args.input_file}")
        sys.exit(1)

    print(f"Processing {len(urls)} URLs from {args.input_file} using {args.workers} workers...")

    total_backup_size = 0
    total_current_size = 0
    results = []
    failures = []
    skipped_count = 0

    # boto3 low-level clients are documented as thread-safe, so a single
    # client is shared by all workers instead of building one per URL.
    # The session token is forwarded so temporary (STS) credentials work;
    # it is None when the variable is unset or empty.
    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
        aws_session_token=os.getenv('AWS_SESSION_TOKEN') or None,
    )

    # Process URLs in parallel with progress bar
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = [executor.submit(process_url, url, s3_client) for url in urls]

        with tqdm(total=len(urls), desc="Processing URLs", unit="url") as pbar:
            for future in as_completed(futures):
                result = future.result()
                status = result['status']

                if status == 'processed':
                    results.append(result)
                    total_backup_size += result['backup_size']
                    total_current_size += result['current_size']
                elif status == 'error':
                    # Keep the failure so it can be reported, not hidden
                    failures.append(result)
                else:
                    skipped_count += 1

                pbar.update(1)

    # Overall reduction, or None when there is nothing to compare against
    if total_backup_size > 0:
        reduction = ((total_backup_size - total_current_size) / total_backup_size) * 100
    else:
        reduction = None

    # Write results to log file
    print(f"\nWriting results to {args.output_file}...")
    print(f"Processed: {len(results)} URLs with size reduction")
    print(f"Skipped: {skipped_count} URLs (no reduction or missing files)")
    if failures:
        print(f"Errors: {len(failures)} URLs (details in {args.output_file})")

    with open(args.output_file, 'w') as log:
        for result in results:
            log.write(f"{result['url']}\n")
            log.write(f"backup: {format_size(result['backup_size'])}\n")
            log.write(f"current: {format_size(result['current_size'])}\n")
            log.write("=" * 50 + "\n")

        # Write totals
        log.write("=" * 50 + "\n")
        log.write(f"total backup: {format_size(total_backup_size)}\n")
        log.write(f"total current: {format_size(total_current_size)}\n")

        if reduction is not None:
            log.write(f"total reduction: {reduction:.1f}%\n")
        else:
            log.write("total reduction: N/A\n")

        log.write("\n")
        log.write(f"Files processed: {len(results)}\n")
        log.write(f"Files skipped: {skipped_count}\n")

        # The failure section is appended only when something failed, so a
        # clean run produces the same report layout as before.
        if failures:
            log.write(f"Files failed: {len(failures)}\n")
            log.write("\n")
            for failure in failures:
                log.write(f"error: {failure['url']}: {failure['error']}\n")

    print(f"\nCompleted! Results written to {args.output_file}")
    print(f"Files with reduction: {len(results)}")
    print(f"Total backup size: {format_size(total_backup_size)}")
    print(f"Total current size: {format_size(total_current_size)}")

    if reduction is not None:
        print(f"Total reduction: {reduction:.1f}%")


# Run the CLI only when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()
