#!/usr/bin/env python3
"""
MULTI-MODAL INTEGRATION
Combine vision, language, and action into unified understanding
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Fixed seeds so every run is reproducible.
torch.manual_seed(42)
np.random.seed(42)

# BUGFIX: fall back to CPU when CUDA is unavailable instead of
# unconditionally requesting 'cuda', which makes every later
# `.to(device)` call fail on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}\n")

class MultiModalNet(nn.Module):
    """Fuse vision, language, and action inputs into one representation.

    Each modality is encoded to a 128-d embedding; any modality that is
    not provided is substituted with a zero embedding so the fusion MLP
    always receives a fixed 384-d concatenation. Task-specific heads then
    map the fused 128-d vector to 10 class logits.
    """

    def __init__(self):
        super().__init__()
        # Vision encoder: (B, 3, H, W) image -> 128-d embedding.
        self.vision_encoder = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),  # makes the encoder spatial-size agnostic
            nn.Flatten(),
            nn.Linear(256, 128)
        )

        # Language encoder: (B, 50) text/command vector -> 128-d embedding.
        self.language_encoder = nn.Sequential(
            nn.Linear(50, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128)
        )

        # Action encoder: (B, 20) motor-command vector -> 128-d embedding.
        self.action_encoder = nn.Sequential(
            nn.Linear(20, 128),
            nn.ReLU(),
            nn.Linear(128, 128)
        )

        # Multi-modal fusion MLP over the concatenated (3 * 128)-d embeddings.
        self.fusion = nn.Sequential(
            nn.Linear(128 * 3, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128)
        )

        # The four task heads share one architecture; keep creation order
        # identical to the original so seeded parameter init is unchanged.
        def _head():
            return nn.Sequential(
                nn.Linear(128, 256),
                nn.ReLU(),
                nn.Linear(256, 10)
            )

        self.vision_language_head = _head()  # visual question answering
        self.language_action_head = _head()  # language-guided action
        self.vision_action_head = _head()    # vision-guided action
        self.unified_head = _head()          # all modalities combined

    def forward(self, vision=None, language=None, action=None, task='unified'):
        """Encode whichever modalities are given, fuse, and classify.

        Args:
            vision: (B, 3, H, W) image batch, or None.
            language: (B, 50) text/command vectors, or None.
            action: (B, 20) motor-command vectors, or None.
            task: 'vision_language', 'language_action', 'vision_action',
                or anything else for the unified head.

        Returns:
            (B, 10) logits from the selected task head.

        Raises:
            ValueError: if all three modalities are None.
        """
        # BUGFIX: infer batch size and device from *any* provided modality.
        # The old code contained dead conditionals (it re-tested
        # `vision is not None` inside the branch where vision is known to
        # be None), crashed for some modality combinations (e.g.
        # action-only input), and allocated the zero padding on a
        # module-level global device instead of the inputs' device.
        present = next(
            (t for t in (vision, language, action) if t is not None), None
        )
        if present is None:
            raise ValueError("at least one modality must be provided")
        batch, dev = present.size(0), present.device

        def _pad():
            # Stand-in embedding for an absent modality.
            return torch.zeros(batch, 128, device=dev)

        vis_enc = self.vision_encoder(vision) if vision is not None else _pad()
        lang_enc = self.language_encoder(language) if language is not None else _pad()
        act_enc = self.action_encoder(action) if action is not None else _pad()

        # Fuse modalities into a single 128-d representation.
        fused = self.fusion(torch.cat([vis_enc, lang_enc, act_enc], dim=1))

        # Route to the task-specific head; unknown tasks use the unified head
        # (same behavior as the original if/elif chain's final else).
        heads = {
            'vision_language': self.vision_language_head,
            'language_action': self.language_action_head,
            'vision_action': self.vision_action_head,
        }
        return heads.get(task, self.unified_head)(fused)

def create_multimodal_task(task_type='unified', batch_size=64):
    """Generate one random batch for a multi-modal task.

    Task types:
      1. 'vision_language': "What color is the object?" -> answer
      2. 'language_action': "Move forward" -> action
      3. 'vision_action':   see obstacle -> avoid (reactive)
      4. 'unified':         all three modalities combined

    Returns:
        (vision, language, action, labels) where absent modalities are
        None. vision is (B, 3, 32, 32), language a (B, 50) one-hot,
        action a (B, 20) one-hot, labels a (B,) LongTensor of class ids
        in [0, 10). All tensors are moved to the module-level `device`.
    """
    # One class id per sample drives both the one-hot inputs and the label
    # (drawn in a single vectorized call instead of a per-sample loop).
    ids = np.random.randint(0, 10, size=batch_size)
    labels = torch.LongTensor(ids).to(device)

    def _one_hot(width):
        # (B, width) float32 one-hot rows selected by `ids` — replaces the
        # original build-row-by-row append loops with fancy indexing.
        mat = np.zeros((batch_size, width), dtype=np.float32)
        mat[np.arange(batch_size), ids] = 1.0
        return torch.from_numpy(mat).to(device)

    def _images():
        return torch.randn(batch_size, 3, 32, 32).to(device)

    if task_type == 'vision_language':
        # Visual question answering: image + question vector.
        return _images(), _one_hot(50), None, labels
    elif task_type == 'language_action':
        # Language-guided action: command vector + matching action vector.
        return None, _one_hot(50), _one_hot(20), labels
    elif task_type == 'vision_action':
        # Vision-guided (reactive) action: image + action vector.
        return _images(), None, _one_hot(20), labels
    else:  # unified — all three modalities
        return _images(), _one_hot(50), _one_hot(20), labels

print("="*70)
print("MULTI-MODAL INTEGRATION")
print("="*70)

model = MultiModalNet().to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001)

print("\nTraining (500 epochs)...")

model.train()  # explicit: dropout layers active while optimizing
for epoch in range(500):
    epoch_loss = 0
    epoch_correct = 0
    epoch_total = 0

    # One optimization step per task type so every head and every
    # modality combination gets trained each epoch.
    for task_type in ['vision_language', 'language_action', 'vision_action', 'unified']:
        vis, lang, act, labels = create_multimodal_task(task_type, 64)

        pred = model(vis, lang, act, task=task_type)
        loss = F.cross_entropy(pred, labels)

        opt.zero_grad()
        loss.backward()
        opt.step()

        epoch_loss += loss.item()
        epoch_correct += (pred.argmax(1) == labels).sum().item()
        epoch_total += len(labels)

    if epoch % 100 == 0:
        acc = epoch_correct / epoch_total
        print(f"  Epoch {epoch}: Loss={epoch_loss:.3f}, Acc={acc*100:.1f}%")

print("\n✅ Training complete")

# Test each integration type
print("\n" + "="*70)
print("TESTING MULTI-MODAL INTEGRATION")
print("="*70)

# BUGFIX: switch to eval mode before measuring accuracy. The original
# left the model in training mode, so the Dropout(0.2) layers kept
# randomly zeroing activations and corrupted the reported numbers.
model.eval()

task_names = {
    'vision_language': 'Vision + Language',
    'language_action': 'Language + Action',
    'vision_action': 'Vision + Action',
    'unified': 'All Modalities'
}

results = {}
for task_type in ['vision_language', 'language_action', 'vision_action', 'unified']:
    accs = []
    for _ in range(20):
        vis, lang, act, labels = create_multimodal_task(task_type, 100)
        with torch.no_grad():
            pred = model(vis, lang, act, task=task_type)
            acc = (pred.argmax(1) == labels).float().mean().item()
            accs.append(acc)

    avg = np.mean(accs)
    results[task_type] = avg
    status = "🎉" if avg >= 0.95 else "✅" if avg >= 0.90 else "⚠️"
    print(f"  {status} {task_names[task_type]}: {avg*100:.1f}%")

overall = np.mean(list(results.values()))
print(f"\n{'='*70}")
print(f"Overall Multi-Modal: {overall*100:.1f}%")

if overall >= 0.95:
    print("🎉 EXCEPTIONAL!")
elif overall >= 0.90:
    print("✅ EXCELLENT!")
else:
    print("✅ Strong!")

torch.save(model.state_dict(), 'multimodal.pth')
print("💾 Saved!")
