#!/usr/bin/env python3
"""
S3 Backup Script
Reads S3 URLs from urls.txt and creates backup copies with .bak extension
Supports parallel processing for faster backups
"""

import argparse
import os
import re
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import unquote, urlparse

import boto3
from botocore.exceptions import ClientError
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables from .env file.
# This runs at import time, before main() looks up AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY with os.getenv().
load_dotenv()

# Thread-safe counter for results
class ResultCounter:
    """Tally of successful and failed backups, safe to update from several threads.

    Attributes are plain integers (``success``, ``failure``); every increment
    happens while ``lock`` is held.
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.success = 0
        self.failure = 0

    def _bump(self, field):
        """Add one to the counter attribute named *field* while holding the lock."""
        with self.lock:
            setattr(self, field, getattr(self, field) + 1)

    def increment_success(self):
        """Record one successful backup."""
        self._bump('success')

    def increment_failure(self):
        """Record one failed backup."""
        self._bump('failure')


# AWS region identifiers look like "us-east-1", "ap-south-1", "us-gov-west-1".
# Requiring this shape keeps endpoints such as "s3-website-us-east-1" from being
# mistaken for a "<bucket>.s3-<region>" host.
_S3_REGION = r'(?P<region>[a-z]{2}(?:-[a-z]+)+-\d+)'

# Virtual-hosted-style host: <bucket>.s3[.<region>].amazonaws.com[.cn]
# (also the legacy "s3-<region>" and the "s3.dualstack.<region>" spellings).
# The bucket part is matched lazily so bucket names containing dots work.
_S3_VIRTUAL_HOSTED_HOST = re.compile(
    r'^(?P<bucket>.+?)\.s3(?:[.-](?:dualstack\.)?' + _S3_REGION + r')?'
    r'\.amazonaws\.com(?:\.cn)?$'
)

# Path-style host: s3[.<region>].amazonaws.com[.cn] (same spelling variants).
_S3_PATH_STYLE_HOST = re.compile(
    r'^s3(?:[.-](?:dualstack\.)?' + _S3_REGION + r')?\.amazonaws\.com(?:\.cn)?$'
)

# Region assumed when the host name does not carry one.
_S3_DEFAULT_REGION = 'us-east-1'


def parse_s3_url(url):
    """
    Parse an S3 URL and extract bucket name, object key and region.

    Supports formats:
    - https://bucket-name.s3.region.amazonaws.com/key/path
    - https://s3.region.amazonaws.com/bucket-name/key/path
    - the same two forms without a region (region defaults to us-east-1),
      with the legacy "s3-region" spelling, or with "s3.dualstack.region"

    Bucket names containing dots are handled, and percent-escapes in the key
    are decoded ("%20" becomes a space; a literal "+" is left untouched).
    Any query string or fragment is ignored.

    Args:
        url: The S3 URL to parse.

    Returns:
        tuple: (bucket_name, object_key, region). object_key is '' when the
        URL names only a bucket.

    Raises:
        ValueError: If the host is not a recognised S3 endpoint or no bucket
            name can be extracted.
    """
    parsed = urlparse(url)
    # hostname (unlike netloc) is lower-cased and excludes port and userinfo.
    host = parsed.hostname or ''
    path = parsed.path.lstrip('/')

    virtual_hosted = _S3_VIRTUAL_HOSTED_HOST.match(host)
    if virtual_hosted:
        region = virtual_hosted.group('region') or _S3_DEFAULT_REGION
        return virtual_hosted.group('bucket'), unquote(path), region

    path_style = _S3_PATH_STYLE_HOST.match(host)
    # Any other host starting with "s3." is still accepted, with the default
    # region, so URLs the previous implementation parsed keep working.
    if path_style or host.startswith('s3.'):
        # Split before decoding so an escaped "/" in the key cannot move the
        # bucket/key boundary.
        bucket_name, _, raw_key = path.partition('/')
        if not bucket_name:
            raise ValueError(f"Unable to parse S3 URL: {url}")
        region = (path_style.group('region') if path_style else None) or _S3_DEFAULT_REGION
        return bucket_name, unquote(raw_key), region

    raise ValueError(f"Unable to parse S3 URL: {url}")


# Clients are built once per (region, credentials) and shared by all worker
# threads. boto3 documents clients as generally thread-safe to use, while
# creating them from the shared default session concurrently is not, so
# construction happens under a lock.
_S3_CLIENT_CACHE = {}
_S3_CLIENT_CACHE_LOCK = threading.Lock()

# Error codes treated as "the source object does not exist": a failed HEAD is
# reported with the bare HTTP status, CopyObject with "NoSuchKey".
_S3_NOT_FOUND_CODES = frozenset({'404', 'NoSuchKey'})


def _get_s3_client(region, aws_access_key, aws_secret_key):
    """Return the shared S3 client for *region* and these credentials, creating it on first use."""
    cache_key = (region, aws_access_key, aws_secret_key)
    with _S3_CLIENT_CACHE_LOCK:
        client = _S3_CLIENT_CACHE.get(cache_key)
        if client is None:
            client = boto3.client(
                's3',
                region_name=region,
                aws_access_key_id=aws_access_key,
                aws_secret_access_key=aws_secret_key
            )
            _S3_CLIENT_CACHE[cache_key] = client
        return client


def backup_s3_object(url, aws_access_key, aws_secret_key):
    """
    Create a backup copy of an S3 object with .bak extension.

    The copy is written to the same bucket under "<key>.bak". This function
    never raises for ordinary failures; they are reported in the returned
    dictionary instead, so it is safe to run in a worker thread.

    Args:
        url: S3 URL to backup
        aws_access_key: AWS access key ID
        aws_secret_key: AWS secret access key

    Returns:
        dict: Result dictionary with keys 'url', 'success' (bool), 'error'
        (message string, or None on success), 'bucket' and 'key' (None when
        the URL could not be parsed).
    """
    result = {
        'url': url,
        'success': False,
        'error': None,
        'bucket': None,
        'key': None
    }

    try:
        bucket_name, object_key, region = parse_s3_url(url)
        result['bucket'] = bucket_name
        result['key'] = object_key

        # A bucket-only URL has nothing to copy; fail with a clear message
        # instead of letting the request validation reject an empty Key.
        if not object_key:
            raise ValueError(f"URL does not contain an object key: {url}")

        s3_client = _get_s3_client(region, aws_access_key, aws_secret_key)

        # Check that the source exists first so a missing object is reported
        # as such rather than as a copy failure.
        s3_client.head_object(Bucket=bucket_name, Key=object_key)

        # NOTE: CopyObject is limited to 5 GB per object by S3; larger objects
        # fail here and would need a multipart copy.
        s3_client.copy_object(
            CopySource={'Bucket': bucket_name, 'Key': object_key},
            Bucket=bucket_name,
            Key=f"{object_key}.bak"
        )

        result['success'] = True

    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code')
        if error_code in _S3_NOT_FOUND_CODES:
            result['error'] = 'Object not found'
        else:
            result['error'] = str(e)
    except ValueError as e:
        result['error'] = str(e)
    except Exception as e:
        # Worker boundary: report anything else instead of killing the thread.
        result['error'] = f"Unexpected error: {str(e)}"

    return result


def _read_urls(path):
    """Return the non-blank, non-comment lines of *path* with surrounding whitespace removed."""
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise end up glued to the first URL.
    with open(path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Test the stripped line so indented "# ..." comments are skipped too.
        return [line for line in stripped_lines if line and not line.startswith('#')]


def main():
    """Main function to process URLs and create backups.

    Reads URLs from urls.txt in the current directory (blank lines and lines
    starting with '#' are ignored), backs each one up on a pool of worker
    threads while showing a progress bar, then prints a summary.

    Command line: one optional positional integer, the number of parallel
    workers (default 4). Credentials come from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables.

    Exits with status 0 when every backup succeeded and 1 otherwise
    (including invalid arguments, missing credentials, or a missing or empty
    URL file).
    """

    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='Backup S3 objects by creating copies with .bak extension',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s 5          # Use 5 parallel workers
  %(prog)s 10         # Use 10 parallel workers
  %(prog)s 1          # Sequential processing (no parallelism)
        """
    )
    parser.add_argument(
        'num_parallel',
        type=int,
        nargs='?',
        default=4,
        help='Number of parallel workers (default: 4)'
    )

    args = parser.parse_args()

    if args.num_parallel < 1:
        print("Error: num_parallel must be at least 1")
        sys.exit(1)

    # Check for AWS credentials
    aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
    aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')

    if not aws_access_key or not aws_secret_key:
        print("Error: AWS credentials not found in .env file")
        print("Please ensure AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set")
        sys.exit(1)

    # Read URLs from file; opening directly avoids an exists-then-open race.
    urls_file = 'urls.txt'
    try:
        urls = _read_urls(urls_file)
    except FileNotFoundError:
        print(f"Error: {urls_file} not found")
        sys.exit(1)

    if not urls:
        print(f"No URLs found in {urls_file}")
        sys.exit(1)

    print("🚀 Starting S3 backup process")
    print(f"📋 Total URLs: {len(urls)}")
    print(f"⚡ Parallel workers: {args.num_parallel}")
    print()

    # Track statistics
    counter = ResultCounter()
    failed_items = []

    # Process URLs in parallel with progress bar
    with ThreadPoolExecutor(max_workers=args.num_parallel) as executor:
        futures = [
            executor.submit(backup_s3_object, url, aws_access_key, aws_secret_key)
            for url in urls
        ]

        # {postfix} must appear in a custom bar_format, otherwise the running
        # success/failure counts set below are never rendered.
        with tqdm(total=len(urls), desc="Backing up", unit="file",
                  bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]') as pbar:

            for future in as_completed(futures):
                result = future.result()

                if result['success']:
                    counter.increment_success()
                else:
                    counter.increment_failure()
                    failed_items.append(result)

                pbar.set_postfix_str(f"✓ {counter.success} | ✗ {counter.failure}", refresh=True)
                pbar.update(1)

    # Print summary
    print("\n" + "="*70)
    print("📊 BACKUP SUMMARY")
    print("="*70)
    print(f"Total URLs processed: {len(urls)}")
    print(f"✅ Successful backups: {counter.success}")
    print(f"❌ Failed backups: {counter.failure}")
    print("="*70)

    # Show failed items if any
    if failed_items:
        print("\n⚠️  Failed Backups:")
        print("-"*70)
        for item in failed_items:
            print(f"\nURL: {item['url']}")
            if item['bucket']:
                print(f"Bucket: {item['bucket']}")
            if item['key']:
                print(f"Key: {item['key']}")
            print(f"Error: {item['error']}")
        print("-"*70)

    # Exit with appropriate code
    sys.exit(0 if counter.failure == 0 else 1)


# Run the backup only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
