# PYTHON AI/ML - Artificial Intelligence and Machine Learning Reference - by Richard Rembert

# Python has become the dominant language for AI, ML, and Data Science
# This reference covers the essential libraries, frameworks, and patterns for modern AI development

# SETUP AND INSTALLATION

# Essential AI/ML packages installation
# pip install numpy pandas matplotlib seaborn scikit-learn
# pip install tensorflow keras torch torchvision
# pip install jupyter jupyterlab
# pip install openai anthropic langchain
# pip install transformers datasets accelerate
# pip install plotly streamlit gradio

# Create virtual environment for AI projects
# python -m venv ai_env
# source ai_env/bin/activate  # On Windows: ai_env\Scripts\activate
# pip install -r requirements.txt

# Jupyter notebook setup
# pip install jupyter
# jupyter notebook  # Start Jupyter
# jupyter lab       # Start JupyterLab (modern interface)

# GPU support (if available)
# pip install tensorflow-gpu  # For TensorFlow
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118  # For PyTorch with CUDA


# NUMPY - NUMERICAL COMPUTING FOUNDATION

import numpy as np

# Array creation
arr_1d = np.array([1, 2, 3, 4, 5])
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])
arr_3d = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])

# Array creation functions
zeros = np.zeros((3, 4))          # 3x4 array of zeros
ones = np.ones((2, 3))            # 2x3 array of ones
identity = np.eye(3)              # 3x3 identity matrix
random_arr = np.random.random((2, 3))  # Random values 0-1
range_arr = np.arange(0, 10, 2)   # [0, 2, 4, 6, 8]
linspace = np.linspace(0, 1, 5)   # 5 equally spaced values from 0 to 1

# Array properties
print(f"Shape: {arr_2d.shape}")      # (2, 3)
print(f"Size: {arr_2d.size}")        # 6
print(f"Dimensions: {arr_2d.ndim}")  # 2
print(f"Data type: {arr_2d.dtype}")  # int64

# Array operations
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])

# Element-wise operations
addition = a + b        # [5, 7, 9]
subtraction = a - b     # [-3, -3, -3]
multiplication = a * b  # [4, 10, 18]
division = a / b        # [0.25, 0.4, 0.5]
power = a ** 2          # [1, 4, 9]

# Mathematical functions
sqrt_a = np.sqrt(a)     # Square root
exp_a = np.exp(a)       # Exponential
log_a = np.log(a)       # Natural logarithm
sin_a = np.sin(a)       # Sine

# Array indexing and slicing
matrix = np.array([[1, 2, 3, 4],
                   [5, 6, 7, 8],
                   [9, 10, 11, 12]])

# Basic indexing
first_row = matrix[0]           # [1, 2, 3, 4]
first_element = matrix[0, 0]    # 1
last_element = matrix[-1, -1]   # 12

# Slicing
submatrix = matrix[0:2, 1:3]    # [[2, 3], [6, 7]]
every_other = matrix[::2, ::2]  # [[1, 3], [9, 11]]

# Boolean indexing
mask = matrix > 5
filtered = matrix[mask]         # [6, 7, 8, 9, 10, 11, 12]

# Array reshaping and manipulation
reshaped = matrix.reshape(4, 3)  # Reshape to 4x3
flattened = matrix.flatten()     # 1D array
transposed = matrix.T            # Transpose

# Statistical operations
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mean = np.mean(data)            # 5.5
median = np.median(data)        # 5.5
std = np.std(data)              # Standard deviation
var = np.var(data)              # Variance
min_val = np.min(data)          # 1
max_val = np.max(data)          # 10
sum_val = np.sum(data)          # 55

# Linear algebra operations
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])

# Matrix multiplication
dot_product = np.dot(A, B)      # Matrix multiplication
matmul = A @ B                  # Alternative syntax (Python 3.5+)

# Matrix operations
determinant = np.linalg.det(A)  # Determinant
inverse = np.linalg.inv(A)      # Inverse
eigenvals, eigenvecs = np.linalg.eig(A)  # Eigenvalues and eigenvectors


# PANDAS - DATA MANIPULATION AND ANALYSIS

import pandas as pd

# Series creation
series = pd.Series([1, 2, 3, 4, 5])
named_series = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

# DataFrame creation
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'City': ['New York', 'London', 'Tokyo', 'Paris'],
    'Salary': [50000, 60000, 70000, 55000]
})

# Reading data from files
# df = pd.read_csv('data.csv')
# df = pd.read_excel('data.xlsx')
# df = pd.read_json('data.json')
# df = pd.read_sql('SELECT * FROM table', connection)

# DataFrame information
print(df.info())                # Data types and null counts
print(df.describe())            # Statistical summary
print(df.head())                # First 5 rows
print(df.tail())                # Last 5 rows
print(df.shape)                 # (rows, columns)
print(df.columns)               # Column names
print(df.dtypes)                # Data types

# Data selection and indexing
names = df['Name']              # Select column
subset = df[['Name', 'Age']]    # Select multiple columns
first_row = df.iloc[0]          # Select by position
alice_data = df.loc[df['Name'] == 'Alice']  # Select by condition

# Filtering data
high_earners = df[df['Salary'] > 55000]
young_people = df[df['Age'] < 30]
complex_filter = df[(df['Age'] > 25) & (df['Salary'] > 50000)]

# Data manipulation
df['Salary_K'] = df['Salary'] / 1000  # Create new column
df['Age_Group'] = df['Age'].apply(lambda x: 'Young' if x < 30 else 'Adult')

# Sorting
sorted_by_age = df.sort_values('Age')
sorted_multi = df.sort_values(['City', 'Age'], ascending=[True, False])

# Grouping and aggregation
age_stats = df.groupby('City')['Age'].mean()
salary_stats = df.groupby('City').agg({
    'Salary': ['mean', 'min', 'max'],
    'Age': 'mean'
})

# Handling missing data
# df.isnull()                   # Check for null values
# df.dropna()                   # Remove rows with null values
# df.fillna(0)                  # Fill null values with 0
# df.fillna(df.mean())          # Fill with mean values

# Data transformation
# One-hot encoding
city_encoded = pd.get_dummies(df['City'])

# Date handling
dates = pd.date_range('2024-01-01', periods=10, freq='D')
date_df = pd.DataFrame({
    'date': dates,
    'value': np.random.randn(10)
})
date_df['year'] = date_df['date'].dt.year
date_df['month'] = date_df['date'].dt.month
date_df['weekday'] = date_df['date'].dt.day_name()

# Merging and joining DataFrames
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})

inner_join = pd.merge(df1, df2, on='key', how='inner')  # Inner join
left_join = pd.merge(df1, df2, on='key', how='left')    # Left join
outer_join = pd.merge(df1, df2, on='key', how='outer')  # Outer join


# MATPLOTLIB - DATA VISUALIZATION

import matplotlib.pyplot as plt

# Basic plotting
x = np.linspace(0, 10, 100)
y = np.sin(x)

plt.figure(figsize=(10, 6))
plt.plot(x, y, label='sin(x)')
plt.xlabel('X values')
plt.ylabel('Y values')
plt.title('Sine Wave')
plt.legend()
plt.grid(True)
plt.show()

# Multiple plots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Line plot
axes[0, 0].plot(x, y)
axes[0, 0].set_title('Line Plot')

# Scatter plot
axes[0, 1].scatter(df['Age'], df['Salary'])
axes[0, 1].set_title('Age vs Salary')
axes[0, 1].set_xlabel('Age')
axes[0, 1].set_ylabel('Salary')

# Histogram
axes[1, 0].hist(df['Age'], bins=10, alpha=0.7)
axes[1, 0].set_title('Age Distribution')

# Bar plot
city_counts = df['City'].value_counts()
axes[1, 1].bar(city_counts.index, city_counts.values)
axes[1, 1].set_title('City Distribution')
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Advanced plotting features
plt.figure(figsize=(10, 6))

# Multiple series
plt.plot(x, np.sin(x), label='sin(x)', linewidth=2)
plt.plot(x, np.cos(x), label='cos(x)', linewidth=2, linestyle='--')
plt.plot(x, np.tan(x), label='tan(x)', linewidth=2, linestyle=':', alpha=0.7)

plt.xlim(0, 2*np.pi)
plt.ylim(-2, 2)
plt.xlabel('X values')
plt.ylabel('Y values')
plt.title('Trigonometric Functions')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


# SEABORN - STATISTICAL VISUALIZATION

import seaborn as sns

# Set style
sns.set_style("whitegrid")
sns.set_palette("husl")

# Create sample dataset
np.random.seed(42)
data = pd.DataFrame({
    'x': np.random.randn(100),
    'y': np.random.randn(100),
    'category': np.random.choice(['A', 'B', 'C'], 100),
    'value': np.random.randint(1, 100, 100)
})

# Correlation heatmap
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.show()

# Distribution plots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Histogram with KDE
sns.histplot(data['x'], kde=True, ax=axes[0, 0])
axes[0, 0].set_title('Distribution with KDE')

# Box plot
sns.boxplot(x='category', y='value', data=data, ax=axes[0, 1])
axes[0, 1].set_title('Box Plot by Category')

# Violin plot
sns.violinplot(x='category', y='value', data=data, ax=axes[1, 0])
axes[1, 0].set_title('Violin Plot by Category')

# Scatter plot with regression
sns.scatterplot(x='x', y='y', hue='category', data=data, ax=axes[1, 1])
sns.regplot(x='x', y='y', data=data, scatter=False, ax=axes[1, 1])
axes[1, 1].set_title('Scatter Plot with Regression')

plt.tight_layout()
plt.show()

# Pair plot for exploring relationships
# sns.pairplot(df, hue='City')
# plt.show()


# SCIKIT-LEARN - MACHINE LEARNING

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification, make_regression, load_iris, load_boston

# Data preprocessing
# Load sample dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Classification Example
# Train multiple models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42)
}

results = {}
for name, model in models.items():
    # Train model
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

# Cross-validation
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    print(f"\n{name} CV Score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

# Regression Example
# Generate regression dataset
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

# Scale features
scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

# Train regression models
reg_models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

for name, model in reg_models.items():
    model.fit(X_train_reg_scaled, y_train_reg)
    y_pred_reg = model.predict(X_test_reg_scaled)
    
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    
    print(f"\n{name} Regression Results:")
    print(f"MSE: {mse:.4f}")
    print(f"R²: {r2:.4f}")

# Clustering Example
# K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X_train_scaled)

# Visualize clusters (if 2D)
if X_train_scaled.shape[1] >= 2:
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c=cluster_labels, cmap='viridis')
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], 
                c='red', marker='x', s=200, linewidths=3, label='Centroids')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('K-Means Clustering')
    plt.legend()
    plt.colorbar(scatter)
    plt.show()


# DEEP LEARNING WITH TENSORFLOW/KERAS

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist, cifar10
from tensorflow.keras.utils import to_categorical

# Check TensorFlow version and GPU availability
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

# Neural Network for MNIST digit classification
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Build model
model = keras.Sequential([
    layers.Flatten(input_shape=(28, 28)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax')
])

# Compile model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Model summary
model.summary()

# Train model
history = model.fit(x_train, y_train,
                   batch_size=128,
                   epochs=10,
                   validation_split=0.1,
                   verbose=1)

# Evaluate model
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f"Test accuracy: {test_accuracy:.4f}")

# Convolutional Neural Network for image classification
def create_cnn_model(input_shape, num_classes):
    model = keras.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# For CIFAR-10 dataset
# (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = cifar10.load_data()
# cnn_model = create_cnn_model((32, 32, 3), 10)

# Transfer Learning Example
base_model = keras.applications.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3)
)

# Freeze base model
base_model.trainable = False

# Add custom top layers
transfer_model = keras.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax')  # Adjust for your number of classes
])

transfer_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Custom training loop example
@tf.function
def train_step(x, y, model, optimizer, loss_fn):
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_fn(y, predictions)
    
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    
    return loss

# Plot training history
def plot_training_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    
    # Plot training & validation accuracy
    ax1.plot(history.history['accuracy'], label='Training Accuracy')
    ax1.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax1.set_title('Model Accuracy')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Accuracy')
    ax1.legend()
    
    # Plot training & validation loss
    ax2.plot(history.history['loss'], label='Training Loss')
    ax2.plot(history.history['val_loss'], label='Validation Loss')
    ax2.set_title('Model Loss')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Loss')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

# plot_training_history(history)


# PYTORCH - ALTERNATIVE DEEP LEARNING FRAMEWORK

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Check PyTorch setup
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Create model instance
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleNN(input_size=784, hidden_size=128, num_classes=10).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert data to PyTorch tensors
X_tensor = torch.FloatTensor(x_train.reshape(-1, 784))
y_tensor = torch.LongTensor(np.argmax(y_train, axis=1))

# Create data loader
dataset = TensorDataset(X_tensor, y_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Training loop
def train_pytorch_model(model, dataloader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}')

# train_pytorch_model(model, dataloader, criterion, optimizer)


# HUGGING FACE TRANSFORMERS - PRE-TRAINED MODELS

from transformers import (
    AutoTokenizer, AutoModel, AutoModelForSequenceClassification,
    pipeline, GPT2LMHeadModel, GPT2Tokenizer, BertTokenizer, BertModel
)

# Text classification pipeline
classifier = pipeline("sentiment-analysis")
results = classifier(["I love this product!", "This is terrible.", "It's okay."])
for result in results:
    print(f"Text: {result}")

# Named Entity Recognition
ner = pipeline("ner", aggregation_strategy="simple")
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
entities = ner(text)
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Score: {entity['score']:.4f}")

# Text Generation
generator = pipeline("text-generation", model="gpt2")
prompts = ["The future of artificial intelligence is"]
generated = generator(prompts, max_length=50, num_return_sequences=2)
for gen in generated:
    print(f"Generated: {gen['generated_text']}")

# Question Answering
qa_pipeline = pipeline("question-answering")
context = """
Machine learning is a subset of artificial intelligence that focuses on algorithms 
that can learn and make decisions from data without being explicitly programmed.
"""
question = "What is machine learning?"
answer = qa_pipeline(question=question, context=context)
print(f"Answer: {answer['answer']}, Score: {answer['score']:.4f}")

# Working with BERT for embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts):
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**encoded)
        # Use the [CLS] token embedding as sentence representation
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    
    return embeddings

texts = ["I love machine learning", "Natural language processing is fascinating"]
embeddings = get_bert_embeddings(texts)
print(f"Embedding shape: {embeddings.shape}")

# Fine-tuning example (simplified)
def fine_tune_classifier(train_texts, train_labels, model_name="bert-base-uncased"):
    from transformers import Trainer, TrainingArguments
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    
    # Tokenize training data
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    
    class Dataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)
    
    train_dataset = Dataset(train_encodings, train_labels)
    
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    
    trainer.train()
    return model, tokenizer


# LLM INTEGRATION - OPENAI AND ANTHROPIC APIs

# OpenAI API integration
import openai
import os

# Set up OpenAI API key
# openai.api_key = os.getenv("OPENAI_API_KEY")

def chat_with_gpt(messages, model="gpt-3.5-turbo", temperature=0.7):
    """
    Chat with OpenAI's GPT models
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=1000
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        return None

# Example usage
# messages = [
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Explain machine learning in simple terms."}
# ]
# response = chat_with_gpt(messages)
# print(response)

# Anthropic Claude API integration
import anthropic

# Set up Anthropic API key
# anthropic_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

def chat_with_claude(message, model="claude-3-sonnet-20240229", max_tokens=1000):
    """
    Chat with Anthropic's Claude models
    """
    try:
        response = anthropic_client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[{"role": "user", "content": message}]
        )
        return response.content[0].text
    except Exception as e:
        print(f"Error calling Anthropic API: {e}")
        return None

# Example usage
# response = chat_with_claude("Explain the difference between supervised and unsupervised learning.")
# print(response)

# Function calling with OpenAI
def get_weather(location):
    """Mock function to get weather data"""
    return f"The weather in {location} is sunny with 75°F"

def function_calling_example():
    """
    Example of using OpenAI function calling
    """
    functions = [
        {
            "name": "get_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA"
                    }
                },
                "required": ["location"]
            }
        }
    ]
    
    messages = [
        {"role": "user", "content": "What's the weather like in New York?"}
    ]
    
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            functions=functions,
            function_call="auto"
        )
        
        if response.choices[0].message.get("function_call"):
            function_name = response.choices[0].message["function_call"]["name"]
            function_args = eval(response.choices[0].message["function_call"]["arguments"])
            
            if function_name == "get_weather":
                result = get_weather(function_args["location"])
                return result
                
    except Exception as e:
        print(f"Error with function calling: {e}")
        return None


# LANGCHAIN - LLM APPLICATION FRAMEWORK

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent, Tool
from langchain.document_loaders import TextLoader, PDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Basic LLM setup
# llm = OpenAI(temperature=0.7, openai_api_key=os.getenv("OPENAI_API_KEY"))
# chat_model = ChatOpenAI(temperature=0.7, openai_api_key=os.getenv("OPENAI_API_KEY"))

# Prompt templates
prompt_template = PromptTemplate(
    input_variables=["topic"],
    template="Write a brief explanation about {topic} for beginners."
)

# LLM Chain
# chain = LLMChain(llm=llm, prompt=prompt_template)
# result = chain.run("machine learning")

# Chat prompt template
chat_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful AI assistant specializing in {domain}."),
    ("human", "{question}")
])

# Memory for conversations
memory = ConversationBufferMemory()

# Sequential chains
def create_sequential_chain():
    """
    Create a sequential chain for complex tasks
    """
    # First chain: Generate a topic outline
    outline_prompt = PromptTemplate(
        input_variables=["topic"],
        template="Create an outline for a tutorial about {topic}."
    )
    # outline_chain = LLMChain(llm=llm, prompt=outline_prompt)
    
    # Second chain: Write content based on outline
    content_prompt = PromptTemplate(
        input_variables=["outline"],
        template="Write detailed content based on this outline:\n{outline}"
    )
    # content_chain = LLMChain(llm=llm, prompt=content_prompt)
    
    # Combine chains
    # sequential_chain = SimpleSequentialChain(chains=[outline_chain, content_chain])
    # return sequential_chain

# Document processing and Q&A
def create_document_qa_system(file_path):
    """
    Create a question-answering system for documents
    """
    # Load documents
    loader = TextLoader(file_path)
    documents = loader.load()
    
    # Split documents
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    
    # Create embeddings and vector store
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(texts, embeddings)
    
    # Create Q&A chain
    from langchain.chains import RetrievalQA
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever()
    )
    
    return qa

# Custom tools for agents
def calculator_tool(expression):
    """Simple calculator tool"""
    try:
        return str(eval(expression))
    except:
        return "Invalid expression"

def search_tool(query):
    """Mock search tool"""
    return f"Search results for: {query}"

# Create agent with tools
def create_agent():
    """
    Create an agent with custom tools
    """
    tools = [
        Tool(
            name="Calculator",
            func=calculator_tool,
            description="Useful for mathematical calculations"
        ),
        Tool(
            name="Search",
            func=search_tool,
            description="Useful for searching information"
        )
    ]
    
    # agent = initialize_agent(
    #     tools,
    #     llm,
    #     agent="zero-shot-react-description",
    #     verbose=True
    # )
    # return agent


# COMPUTER VISION

import cv2
from PIL import Image
import face_recognition

# OpenCV basics
def process_image_opencv(image_path):
    """
    Basic image processing with OpenCV
    """
    # Read image
    img = cv2.imread(image_path)
    
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Apply Gaussian blur
    blurred = cv2.GaussianBlur(gray, (15, 15), 0)
    
    # Edge detection
    edges = cv2.Canny(blurred, 50, 150)
    
    # Find contours
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    # Draw contours
    result = img.copy()
    cv2.drawContours(result, contours, -1, (0, 255, 0), 2)
    
    return result, gray, edges

# Face recognition
def detect_faces(image_path):
    """
    Detect and recognize faces in an image
    """
    # Load image
    image = face_recognition.load_image_file(image_path)
    
    # Find face locations
    face_locations = face_recognition.face_locations(image)
    face_encodings = face_recognition.face_encodings(image, face_locations)
    
    print(f"Found {len(face_locations)} face(s) in the image")
    
    return face_locations, face_encodings

# Object detection with YOLO (requires additional setup)
def setup_yolo_detection():
    """
    Setup YOLO for object detection
    Note: Requires downloading YOLO weights and config files
    """
    # Load YOLO
    net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
    
    # Load class names
    with open("coco.names", "r") as f:
        classes = [line.strip() for line in f.readlines()]
    
    return net, classes

def detect_objects(image_path, net, classes):
    """
    Detect objects in an image using YOLO
    """
    # Load image
    image = cv2.imread(image_path)
    height, width, channels = image.shape
    
    # Prepare image for YOLO
    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outs = net.forward()
    
    # Process detections
    class_ids = []
    confidences = []
    boxes = []
    
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            
            if confidence > 0.5:
                # Object detected
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                
                # Rectangle coordinates
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                
                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)
    
    return boxes, confidences, class_ids


# NATURAL LANGUAGE PROCESSING

import nltk
import spacy
from textblob import TextBlob
from wordcloud import WordCloud

# Download required NLTK data
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('vader_lexicon')

# Load spaCy model
# nlp = spacy.load("en_core_web_sm")

# Basic text preprocessing
def preprocess_text(text):
    """
    Basic text preprocessing pipeline
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and special characters
    import re
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

# Sentiment analysis
def analyze_sentiment(text):
    """
    Analyze sentiment using TextBlob and NLTK
    """
    # TextBlob sentiment
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    
    # NLTK VADER sentiment
    from nltk.sentiment import SentimentIntensityAnalyzer
    sia = SentimentIntensityAnalyzer()
    vader_scores = sia.polarity_scores(text)
    
    return {
        'textblob_polarity': polarity,
        'textblob_subjectivity': subjectivity,
        'vader_scores': vader_scores
    }

# Named Entity Recognition with spaCy
def extract_entities(text):
    """
    Extract named entities using spaCy
    """
    doc = nlp(text)
    entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]
    return entities

# Topic modeling with LDA
def topic_modeling(documents, num_topics=5):
    """
    Perform topic modeling using LDA
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    
    # Vectorize documents
    vectorizer = CountVectorizer(max_features=1000, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(documents)
    
    # Fit LDA model
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(doc_term_matrix)
    
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    
    # Display topics
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [feature_names[i] for i in topic.argsort()[-10:]]
        topics.append(top_words)
    
    return topics, lda, vectorizer

# Word cloud generation
def create_wordcloud(text, width=800, height=400):
    """
    Create a word cloud from text
    """
    wordcloud = WordCloud(
        width=width,
        height=height,
        background_color='white',
        max_words=100,
        colormap='viridis'
    ).generate(text)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    
    return wordcloud


# WEB APPLICATIONS WITH STREAMLIT

import streamlit as st
import plotly.express as px
import plotly.graph_objects as go

# Basic Streamlit app structure
def create_ml_app():
    """
    Create a machine learning web app with Streamlit
    """
    st.title("Machine Learning Dashboard")
    st.sidebar.title("Navigation")
    
    # Sidebar options
    page = st.sidebar.selectbox("Choose a page", 
                               ["Data Upload", "EDA", "Model Training", "Predictions"])
    
    if page == "Data Upload":
        st.header("Data Upload")
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
        
        if uploaded_file is not None:
            df = pd.read_csv(uploaded_file)
            st.write("Data Preview:")
            st.dataframe(df.head())
            
            # Store in session state
            st.session_state['data'] = df
    
    elif page == "EDA":
        st.header("Exploratory Data Analysis")
        
        if 'data' in st.session_state:
            df = st.session_state['data']
            
            # Basic statistics
            st.subheader("Dataset Info")
            st.write(f"Shape: {df.shape}")
            st.write("Statistical Summary:")
            st.dataframe(df.describe())
            
            # Visualizations
            st.subheader("Visualizations")
            
            # Select columns for plotting
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
            
            if len(numeric_cols) >= 2:
                col1, col2 = st.columns(2)
                
                with col1:
                    x_axis = st.selectbox("X-axis", numeric_cols)
                with col2:
                    y_axis = st.selectbox("Y-axis", numeric_cols)
                
                # Create scatter plot
                fig = px.scatter(df, x=x_axis, y=y_axis)
                st.plotly_chart(fig)
        else:
            st.warning("Please upload data first!")
    
    elif page == "Model Training":
        st.header("Model Training")
        
        if 'data' in st.session_state:
            df = st.session_state['data']
            
            # Select target variable
            target = st.selectbox("Select target variable", df.columns)
            
            # Select features
            features = st.multiselect("Select features", 
                                    [col for col in df.columns if col != target])
            
            if features and target:
                # Model selection
                model_type = st.selectbox("Select model", 
                                        ["Linear Regression", "Random Forest", "SVM"])
                
                if st.button("Train Model"):
                    X = df[features]
                    y = df[target]
                    
                    # Split data
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=0.2, random_state=42
                    )
                    
                    # Train model based on selection
                    if model_type == "Linear Regression":
                        model = LinearRegression()
                    elif model_type == "Random Forest":
                        model = RandomForestRegressor()
                    else:
                        model = SVC()
                    
                    model.fit(X_train, y_train)
                    predictions = model.predict(X_test)
                    
                    # Display results
                    if model_type != "SVM":  # For regression
                        mse = mean_squared_error(y_test, predictions)
                        r2 = r2_score(y_test, predictions)
                        st.write(f"MSE: {mse:.4f}")
                        st.write(f"R²: {r2:.4f}")
                    
                    # Store model
                    st.session_state['model'] = model
                    st.session_state['features'] = features
                    
                    st.success("Model trained successfully!")
        else:
            st.warning("Please upload data first!")
    
    elif page == "Predictions":
        st.header("Make Predictions")
        
        if 'model' in st.session_state:
            model = st.session_state['model']
            features = st.session_state['features']
            
            st.write("Enter values for prediction:")
            
            # Create input fields for each feature
            input_data = {}
            for feature in features:
                input_data[feature] = st.number_input(f"Enter {feature}")
            
            if st.button("Predict"):
                # Make prediction
                input_df = pd.DataFrame([input_data])
                prediction = model.predict(input_df)
                
                st.write(f"Prediction: {prediction[0]:.4f}")
        else:
            st.warning("Please train a model first!")

# Run the Streamlit app
# if __name__ == "__main__":
#     create_ml_app()


# GRADIO - INTERACTIVE ML INTERFACES

import gradio as gr

def create_gradio_interface():
    """
    Create interactive ML interfaces with Gradio
    """
    
    def predict_sentiment(text):
        """Simple sentiment prediction function"""
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        
        if polarity > 0.1:
            return "Positive", polarity
        elif polarity < -0.1:
            return "Negative", polarity
        else:
            return "Neutral", polarity
    
    def classify_image(image):
        """Mock image classification function"""
        # In reality, you would use a trained model here
        return {"cat": 0.7, "dog": 0.3}
    
    # Text interface
    text_interface = gr.Interface(
        fn=predict_sentiment,
        inputs=gr.Textbox(placeholder="Enter text for sentiment analysis..."),
        outputs=[gr.Textbox(label="Sentiment"), gr.Number(label="Polarity Score")],
        title="Sentiment Analysis",
        description="Analyze the sentiment of your text"
    )
    
    # Image interface
    image_interface = gr.Interface(
        fn=classify_image,
        inputs=gr.Image(),
        outputs=gr.Label(num_top_classes=3),
        title="Image Classification",
        description="Upload an image to classify"
    )
    
    # Combine interfaces in tabs
    demo = gr.TabbedInterface(
        [text_interface, image_interface],
        ["Sentiment Analysis", "Image Classification"]
    )
    
    return demo

# Launch Gradio interface
# demo = create_gradio_interface()
# demo.launch()


# ADVANCED TOPICS

# Time Series Analysis
def time_series_analysis():
    """
    Time series analysis and forecasting
    """
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.seasonal import seasonal_decompose
    
    # Generate sample time series data
    dates = pd.date_range('2020-01-01', periods=365, freq='D')
    ts_data = pd.Series(
        np.random.randn(365).cumsum() + np.sin(np.arange(365) * 2 * np.pi / 365) * 10,
        index=dates
    )
    
    # Decompose time series
    decomposition = seasonal_decompose(ts_data, model='additive', period=30)
    
    # Fit ARIMA model
    model = ARIMA(ts_data, order=(1, 1, 1))
    fitted_model = model.fit()
    
    # Make predictions
    forecast = fitted_model.forecast(steps=30)
    
    return ts_data, decomposition, forecast

# Reinforcement Learning (basic Q-learning)
def q_learning_example():
    """
    Simple Q-learning implementation
    """
    import random
    
    class QLearningAgent:
        def __init__(self, states, actions, learning_rate=0.1, discount_factor=0.9):
            self.states = states
            self.actions = actions
            self.learning_rate = learning_rate
            self.discount_factor = discount_factor
            self.q_table = {}
            
            # Initialize Q-table
            for state in states:
                self.q_table[state] = {}
                for action in actions:
                    self.q_table[state][action] = 0.0
        
        def choose_action(self, state, epsilon=0.1):
            """Choose action using epsilon-greedy policy"""
            if random.random() < epsilon:
                return random.choice(self.actions)
            else:
                return max(self.q_table[state], key=self.q_table[state].get)
        
        def update_q_table(self, state, action, reward, next_state):
            """Update Q-table using Q-learning formula"""
            best_next_action = max(self.q_table[next_state], key=self.q_table[next_state].get)
            td_target = reward + self.discount_factor * self.q_table[next_state][best_next_action]
            td_error = td_target - self.q_table[state][action]
            self.q_table[state][action] += self.learning_rate * td_error
    
    # Example usage
    states = ['start', 'middle', 'end']
    actions = ['left', 'right']
    agent = QLearningAgent(states, actions)
    
    return agent

# Feature Engineering utilities
def advanced_feature_engineering(df):
    """
    Advanced feature engineering techniques
    """
    # Polynomial features
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(degree=2, include_bias=False)
    
    # Feature selection
    from sklearn.feature_selection import SelectKBest, f_classif, RFE
    from sklearn.ensemble import RandomForestClassifier
    
    # Select best features
    selector = SelectKBest(score_func=f_classif, k=10)
    
    # Recursive feature elimination
    estimator = RandomForestClassifier()
    rfe = RFE(estimator, n_features_to_select=5)
    
    # Feature importance
    rf = RandomForestClassifier()
    # rf.fit(X, y)  # Assuming X, y are defined
    # importances = rf.feature_importances_
    
    return poly, selector, rfe

# Model interpretability
def model_interpretability():
    """
    Model interpretability techniques
    """
    import shap
    from lime import lime_tabular
    
    # SHAP (SHapley Additive exPlanations)
    def explain_with_shap(model, X_train, X_test):
        explainer = shap.Explainer(model, X_train)
        shap_values = explainer(X_test)
        
        # Plot SHAP values
        shap.summary_plot(shap_values, X_test)
        return shap_values
    
    # LIME (Local Interpretable Model-agnostic Explanations)
    def explain_with_lime(model, X_train, X_test, instance_idx=0):
        explainer = lime_tabular.LimeTabularExplainer(
            X_train.values,
            feature_names=X_train.columns,
            class_names=['Class 0', 'Class 1'],
            mode='classification'
        )
        
        explanation = explainer.explain_instance(
            X_test.iloc[instance_idx].values,
            model.predict_proba
        )
        
        return explanation
    
    return explain_with_shap, explain_with_lime


# DEPLOYMENT AND PRODUCTION

# Model serialization
import joblib
import pickle

def save_load_models():
    """
    Save and load machine learning models
    """
    # Save with joblib (recommended for scikit-learn)
    def save_model_joblib(model, filename):
        joblib.dump(model, filename)
    
    def load_model_joblib(filename):
        return joblib.load(filename)
    
    # Save with pickle
    def save_model_pickle(model, filename):
        with open(filename, 'wb') as f:
            pickle.dump(model, f)
    
    def load_model_pickle(filename):
        with open(filename, 'rb') as f:
            return pickle.load(f)
    
    return save_model_joblib, load_model_joblib, save_model_pickle, load_model_pickle

# API creation with FastAPI
from fastapi import FastAPI
from pydantic import BaseModel

def create_ml_api():
    """
    Create a machine learning API with FastAPI
    """
    app = FastAPI(title="ML API", description="Machine Learning API for predictions")
    
    # Load model (in production, do this once at startup)
    # model = joblib.load("model.pkl")
    
    class PredictionRequest(BaseModel):
        features: list
    
    class PredictionResponse(BaseModel):
        prediction: float
        probability: float = None
    
    @app.post("/predict", response_model=PredictionResponse)
    async def predict(request: PredictionRequest):
        """Make predictions using the trained model"""
        try:
            # features_array = np.array(request.features).reshape(1, -1)
            # prediction = model.predict(features_array)[0]
            # probability = model.predict_proba(features_array)[0].max() if hasattr(model, 'predict_proba') else None
            
            # Mock response
            prediction = 0.85
            probability = 0.92
            
            return PredictionResponse(prediction=prediction, probability=probability)
        except Exception as e:
            raise HTTPException(status_code=400, detail=str(e))
    
    @app.get("/health")
    async def health_check():
        """Health check endpoint"""
        return {"status": "healthy"}
    
    return app

# Docker deployment
def create_dockerfile():
    """
    Create Dockerfile for ML application deployment
    """
    dockerfile_content = """
FROM python:3.9-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8000

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
"""
    
    requirements_content = """
fastapi==0.68.0
uvicorn==0.15.0
pandas==1.3.3
scikit-learn==1.0.2
numpy==1.21.2
joblib==1.0.1
"""
    
    return dockerfile_content, requirements_content

# Monitoring and logging
import logging
from datetime import datetime

def setup_ml_monitoring():
    """
    Setup monitoring and logging for ML applications
    """
    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('ml_app.log'),
            logging.StreamHandler()
        ]
    )
    
    logger = logging.getLogger(__name__)
    
    def log_prediction(features, prediction, model_version="1.0"):
        """Log prediction for monitoring"""
        log_data = {
            "timestamp": datetime.now().isoformat(),
            "features": features,
            "prediction": prediction,
            "model_version": model_version
        }
        logger.info(f"Prediction made: {log_data}")
    
    def log_model_performance(metrics):
        """Log model performance metrics"""
        logger.info(f"Model performance: {metrics}")
    
    return log_prediction, log_model_performance


# BEST PRACTICES AND UTILITIES

def data_validation():
    """
    Data validation and quality checks
    """
    def check_data_quality(df):
        """Comprehensive data quality check"""
        quality_report = {
            "shape": df.shape,
            "missing_values": df.isnull().sum().to_dict(),
            "duplicates": df.duplicated().sum(),
            "data_types": df.dtypes.to_dict(),
            "memory_usage": df.memory_usage(deep=True).sum(),
            "unique_counts": df.nunique().to_dict()
        }
        
        # Check for potential issues
        issues = []
        if quality_report["duplicates"] > 0:
            issues.append(f"Found {quality_report['duplicates']} duplicate rows")
        
        if any(count > 0 for count in quality_report["missing_values"].values()):
            issues.append("Missing values detected")
        
        quality_report["issues"] = issues
        return quality_report
    
    def detect_outliers(df, method="iqr"):
        """Detect outliers in numerical columns"""
        outliers = {}
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        
        for col in numeric_cols:
            if method == "iqr":
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers[col] = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index.tolist()
            
            elif method == "zscore":
                from scipy import stats
                z_scores = np.abs(stats.zscore(df[col]))
                outliers[col] = df[z_scores > 3].index.tolist()
        
        return outliers
    
    return check_data_quality, detect_outliers

def model_evaluation_suite():
    """
    Comprehensive model evaluation utilities
    """
    def evaluate_classification_model(y_true, y_pred, y_prob=None, class_names=None):
        """Comprehensive classification evaluation"""
        from sklearn.metrics import (
            accuracy_score, precision_recall_fscore_support, 
            confusion_matrix, roc_auc_score, roc_curve,
            precision_recall_curve, average_precision_score
        )
        
        # Basic metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        
        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        
        results = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'confusion_matrix': cm,
            'support': support
        }
        
        # ROC AUC if probabilities provided
        if y_prob is not None:
            if len(np.unique(y_true)) == 2:  # Binary classification
                results['roc_auc'] = roc_auc_score(y_true, y_prob[:, 1])
                results['average_precision'] = average_precision_score(y_true, y_prob[:, 1])
            else:  # Multiclass
                results['roc_auc'] = roc_auc_score(y_true, y_prob, multi_class='ovr')
        
        # Visualization
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        # Confusion Matrix
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, 0])
        axes[0, 0].set_title('Confusion Matrix')
        axes[0, 0].set_xlabel('Predicted')
        axes[0, 0].set_ylabel('Actual')
        
        # ROC Curve (for binary classification)
        if y_prob is not None and len(np.unique(y_true)) == 2:
            fpr, tpr, _ = roc_curve(y_true, y_prob[:, 1])
            axes[0, 1].plot(fpr, tpr, label=f'ROC Curve (AUC = {results["roc_auc"]:.3f})')
            axes[0, 1].plot([0, 1], [0, 1], 'k--')
            axes[0, 1].set_xlabel('False Positive Rate')
            axes[0, 1].set_ylabel('True Positive Rate')
            axes[0, 1].set_title('ROC Curve')
            axes[0, 1].legend()
        
        # Precision-Recall Curve (for binary classification)
        if y_prob is not None and len(np.unique(y_true)) == 2:
            precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_prob[:, 1])
            axes[1, 0].plot(recall_curve, precision_curve, 
                           label=f'PR Curve (AP = {results["average_precision"]:.3f})')
            axes[1, 0].set_xlabel('Recall')
            axes[1, 0].set_ylabel('Precision')
            axes[1, 0].set_title('Precision-Recall Curve')
            axes[1, 0].legend()
        
        # Feature importance (if available)
        axes[1, 1].text(0.1, 0.5, f'Accuracy: {accuracy:.3f}\nPrecision: {precision:.3f}\nRecall: {recall:.3f}\nF1-Score: {f1:.3f}')
        axes[1, 1].set_title('Model Metrics Summary')
        axes[1, 1].axis('off')
        
        plt.tight_layout()
        plt.show()
        
        return results
    
    def evaluate_regression_model(y_true, y_pred):
        """Comprehensive regression evaluation"""
        from sklearn.metrics import (
            mean_squared_error, mean_absolute_error, 
            r2_score, explained_variance_score
        )
        
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        explained_var = explained_variance_score(y_true, y_pred)
        
        # Calculate residuals
        residuals = y_true - y_pred
        
        results = {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2_score': r2,
            'explained_variance': explained_var,
            'residuals': residuals
        }
        
        # Visualization
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        
        # Actual vs Predicted
        axes[0, 0].scatter(y_true, y_pred, alpha=0.6)
        axes[0, 0].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
        axes[0, 0].set_xlabel('Actual Values')
        axes[0, 0].set_ylabel('Predicted Values')
        axes[0, 0].set_title('Actual vs Predicted')
        
        # Residuals plot
        axes[0, 1].scatter(y_pred, residuals, alpha=0.6)
        axes[0, 1].axhline(y=0, color='r', linestyle='--')
        axes[0, 1].set_xlabel('Predicted Values')
        axes[0, 1].set_ylabel('Residuals')
        axes[0, 1].set_title('Residuals Plot')
        
        # Residuals histogram
        axes[1, 0].hist(residuals, bins=30, alpha=0.7, edgecolor='black')
        axes[1, 0].set_xlabel('Residuals')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Residuals Distribution')
        
        # Q-Q plot
        from scipy import stats
        stats.probplot(residuals, dist="norm", plot=axes[1, 1])
        axes[1, 1].set_title('Q-Q Plot')
        
        plt.tight_layout()
        plt.show()
        
        return results
    
    return evaluate_classification_model, evaluate_regression_model

def cross_validation_suite():
    """
    Advanced cross-validation techniques
    """
    from sklearn.model_selection import (
        StratifiedKFold, TimeSeriesSplit, GroupKFold,
        cross_validate, learning_curve, validation_curve
    )
    
    def advanced_cross_validation(model, X, y, cv_type='stratified', n_splits=5, groups=None):
        """Perform advanced cross-validation"""
        
        # Choose CV strategy
        if cv_type == 'stratified':
            cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        elif cv_type == 'time_series':
            cv = TimeSeriesSplit(n_splits=n_splits)
        elif cv_type == 'group':
            cv = GroupKFold(n_splits=n_splits)
        else:
            cv = n_splits
        
        # Scoring metrics
        scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
        
        # Perform cross-validation
        cv_results = cross_validate(
            model, X, y, cv=cv, scoring=scoring, 
            return_train_score=True, groups=groups
        )
        
        # Summarize results
        results_summary = {}
        for metric in scoring:
            results_summary[metric] = {
                'test_mean': cv_results[f'test_{metric}'].mean(),
                'test_std': cv_results[f'test_{metric}'].std(),
                'train_mean': cv_results[f'train_{metric}'].mean(),
                'train_std': cv_results[f'train_{metric}'].std()
            }
        
        return cv_results, results_summary
    
    def plot_learning_curve(model, X, y, cv=5, train_sizes=None):
        """Plot learning curves"""
        if train_sizes is None:
            train_sizes = np.linspace(0.1, 1.0, 10)
        
        train_sizes, train_scores, val_scores = learning_curve(
            model, X, y, cv=cv, train_sizes=train_sizes, 
            scoring='accuracy', n_jobs=-1
        )
        
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)
        
        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes, train_mean, 'o-', color='blue', label='Training Score')
        plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
        
        plt.plot(train_sizes, val_mean, 'o-', color='red', label='Validation Score')
        plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')
        
        plt.xlabel('Training Set Size')
        plt.ylabel('Accuracy Score')
        plt.title('Learning Curves')
        plt.legend()
        plt.grid(True)
        plt.show()
        
        return train_sizes, train_scores, val_scores
    
    def plot_validation_curve(model, X, y, param_name, param_range, cv=5):
        """Plot validation curves for hyperparameter tuning"""
        train_scores, val_scores = validation_curve(
            model, X, y, param_name=param_name, param_range=param_range,
            cv=cv, scoring='accuracy', n_jobs=-1
        )
        
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)
        
        plt.figure(figsize=(10, 6))
        plt.plot(param_range, train_mean, 'o-', color='blue', label='Training Score')
        plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
        
        plt.plot(param_range, val_mean, 'o-', color='red', label='Validation Score')
        plt.fill_between(param_range, val_mean - val_std, val_mean + val_std, alpha=0.1, color='red')
        
        plt.xlabel(param_name)
        plt.ylabel('Accuracy Score')
        plt.title(f'Validation Curve for {param_name}')
        plt.legend()
        plt.grid(True)
        plt.show()
        
        return train_scores, val_scores
    
    return advanced_cross_validation, plot_learning_curve, plot_validation_curve

def ensemble_methods():
    """
    Advanced ensemble learning techniques
    """
    from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
    
    def create_voting_ensemble(base_models, X_train, y_train, voting='hard'):
        """Create a voting ensemble"""
        voting_clf = VotingClassifier(
            estimators=base_models,
            voting=voting
        )
        voting_clf.fit(X_train, y_train)
        return voting_clf
    
    def create_stacking_ensemble(base_models, meta_model, X_train, y_train, cv=5):
        """Create a stacking ensemble"""
        from sklearn.ensemble import StackingClassifier
        
        stacking_clf = StackingClassifier(
            estimators=base_models,
            final_estimator=meta_model,
            cv=cv
        )
        stacking_clf.fit(X_train, y_train)
        return stacking_clf
    
    def create_boosting_ensemble(base_estimator, n_estimators=50, learning_rate=1.0):
        """Create an AdaBoost ensemble"""
        ada_boost = AdaBoostClassifier(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=42
        )
        return ada_boost
    
    def compare_ensemble_methods(X_train, X_test, y_train, y_test):
        """Compare different ensemble methods"""
        from sklearn.linear_model import LogisticRegression
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.svm import SVC
        
        # Base models
        base_models = [
            ('lr', LogisticRegression(random_state=42)),
            ('dt', DecisionTreeClassifier(random_state=42)),
            ('svm', SVC(probability=True, random_state=42))
        ]
        
        # Ensemble methods
        ensembles = {
            'Voting (Hard)': VotingClassifier(estimators=base_models, voting='hard'),
            'Voting (Soft)': VotingClassifier(estimators=base_models, voting='soft'),
            'Bagging': BaggingClassifier(random_state=42),
            'AdaBoost': AdaBoostClassifier(random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42),
            'Extra Trees': ExtraTreesClassifier(random_state=42)
        }
        
        results = {}
        for name, ensemble in ensembles.items():
            ensemble.fit(X_train, y_train)
            accuracy = ensemble.score(X_test, y_test)
            results[name] = accuracy
            print(f"{name}: {accuracy:.4f}")
        
        return results
    
    return create_voting_ensemble, create_stacking_ensemble, create_boosting_ensemble, compare_ensemble_methods

def automated_ml_pipeline():
    """
    Automated ML pipeline creation
    """
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
    from sklearn.impute import SimpleImputer
    
    def create_preprocessing_pipeline(numeric_features, categorical_features):
        """Create preprocessing pipeline"""
        
        # Numeric pipeline
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])
        
        # Categorical pipeline
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        
        # Combine preprocessing steps
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ]
        )
        
        return preprocessor
    
    def create_full_pipeline(preprocessor, model):
        """Create full ML pipeline"""
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        return pipeline
    
    def automated_model_selection(X, y, test_size=0.2):
        """Automated model selection and evaluation"""
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        
        # Identify feature types
        numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        categorical_features = X.select_dtypes(include=['object']).columns.tolist()
        
        # Create preprocessor
        preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
        
        # Models to test
        models = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Random Forest': RandomForestClassifier(random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(random_state=42),
            'SVM': SVC(random_state=42)
        }
        
        results = {}
        best_model = None
        best_score = 0
        
        for name, model in models.items():
            # Create pipeline
            pipeline = create_full_pipeline(preprocessor, model)
            
            # Train and evaluate
            pipeline.fit(X_train, y_train)
            score = pipeline.score(X_test, y_test)
            
            results[name] = score
            
            if score > best_score:
                best_score = score
                best_model = pipeline
            
            print(f"{name}: {score:.4f}")
        
        return best_model, results
    
    return create_preprocessing_pipeline, create_full_pipeline, automated_model_selection


# PERFORMANCE OPTIMIZATION

def memory_optimization():
    """
    Memory optimization techniques for large datasets
    """
    def optimize_dataframe_memory(df):
        """Optimize DataFrame memory usage"""
        original_memory = df.memory_usage(deep=True).sum()
        
        for col in df.columns:
            col_type = df[col].dtype
            
            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float32)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)
            else:
                df[col] = df[col].astype('category')
        
        optimized_memory = df.memory_usage(deep=True).sum()
        memory_reduction = (original_memory - optimized_memory) / original_memory * 100
        
        print(f"Memory usage reduced by {memory_reduction:.2f}%")
        print(f"Original: {original_memory / 1024**2:.2f} MB")
        print(f"Optimized: {optimized_memory / 1024**2:.2f} MB")
        
        return df
    
    def chunked_processing(file_path, chunk_size=10000, processing_func=None):
        """Process large files in chunks"""
        results = []
        
        for chunk in pd.read_csv(file_path, chunksize=chunk_size):
            if processing_func:
                processed_chunk = processing_func(chunk)
                results.append(processed_chunk)
            else:
                results.append(chunk)
        
        return pd.concat(results, ignore_index=True) if results else None
    
    return optimize_dataframe_memory, chunked_processing

def parallel_processing():
    """
    Parallel processing techniques for ML workflows
    """
    from multiprocessing import Pool, cpu_count
    from joblib import Parallel, delayed
    
    def parallel_cross_validation(model, X, y, cv_folds=5, n_jobs=-1):
        """Parallel cross-validation"""
        def train_fold(train_idx, test_idx):
            X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
            y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]
            
            model_clone = clone(model)
            model_clone.fit(X_train_fold, y_train_fold)
            score = model_clone.score(X_test_fold, y_test_fold)
            return score
        
        from sklearn.model_selection import KFold
        from sklearn.base import clone
        
        kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
        
        scores = Parallel(n_jobs=n_jobs)(
            delayed(train_fold)(train_idx, test_idx) 
            for train_idx, test_idx in kf.split(X)
        )
        
        return np.array(scores)
    
    def parallel_hyperparameter_search(model_class, param_grid, X, y, n_jobs=-1):
        """Parallel hyperparameter search"""
        from itertools import product
        from sklearn.model_selection import train_test_split
        
        def evaluate_params(params):
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
            
            model = model_class(**params)
            model.fit(X_train, y_train)
            score = model.score(X_val, y_val)
            
            return params, score
        
        # Generate all parameter combinations
        param_combinations = [
            dict(zip(param_grid.keys(), values))
            for values in product(*param_grid.values())
        ]
        
        results = Parallel(n_jobs=n_jobs)(
            delayed(evaluate_params)(params) for params in param_combinations
        )
        
        # Find best parameters
        best_params, best_score = max(results, key=lambda x: x[1])
        
        return best_params, best_score, results
    
    return parallel_cross_validation, parallel_hyperparameter_search


# SECURITY AND PRIVACY

def data_privacy():
    """
    Data privacy and security techniques
    """
    def anonymize_data(df, sensitive_columns):
        """Anonymize sensitive data"""
        import hashlib
        
        df_anon = df.copy()
        
        for col in sensitive_columns:
            if col in df_anon.columns:
                # Hash sensitive values
                df_anon[col] = df_anon[col].apply(
                    lambda x: hashlib.sha256(str(x).encode()).hexdigest()[:10]
                )
        
        return df_anon
    
    def add_differential_privacy(data, epsilon=1.0, sensitivity=1.0):
        """Add differential privacy noise"""
        # Add Laplace noise for differential privacy
        noise_scale = sensitivity / epsilon
        noise = np.random.laplace(0, noise_scale, size=data.shape)
        
        return data + noise
    
    def secure_model_inference(model, input_data, encryption_key=None):
        """Secure model inference with basic encryption"""
        # This is a simplified example - use proper encryption in production
        if encryption_key:
            # Encrypt input data (simplified)
            encrypted_input = input_data ^ encryption_key
            prediction = model.predict(encrypted_input.reshape(1, -1))
            return prediction
        else:
            return model.predict(input_data.reshape(1, -1))
    
    return anonymize_data, add_differential_privacy, secure_model_inference


# CONCLUSION AND SUMMARY

def create_project_template():
    """
    Create a template structure for ML projects
    """
    template_structure = """
    ml_project/
    ├── data/
    │   ├── raw/
    │   ├── processed/
    │   └── external/
    ├── notebooks/
    │   ├── 01_data_exploration.ipynb
    │   ├── 02_data_preprocessing.ipynb
    │   ├── 03_model_training.ipynb
    │   └── 04_model_evaluation.ipynb
    ├── src/
    │   ├── __init__.py
    │   ├── data/
    │   │   ├── __init__.py
    │   │   └── data_loader.py
    │   ├── features/
    │   │   ├── __init__.py
    │   │   └── feature_engineering.py
    │   ├── models/
    │   │   ├── __init__.py
    │   │   └── train_model.py
    │   └── visualization/
    │       ├── __init__.py
    │       └── visualize.py
    ├── models/
    │   └── trained_models/
    ├── reports/
    │   └── figures/
    ├── requirements.txt
    ├── setup.py
    ├── Dockerfile
    ├── README.md
    └── config.yaml
    """
    
    return template_structure

# Final utility functions
def get_system_info():
    """Get system information for ML development"""
    import platform
    import psutil
    
    info = {
        "python_version": platform.python_version(),
        "platform": platform.platform(),
        "processor": platform.processor(),
        "cpu_cores": psutil.cpu_count(),
        "memory_gb": round(psutil.virtual_memory().total / (1024**3), 2),
        "gpu_available": torch.cuda.is_available() if 'torch' in globals() else "PyTorch not loaded"
    }
    
    return info

def print_ml_cheatsheet():
    """Print a quick ML cheatsheet"""
    cheatsheet = """
    ═══════════════════════════════════════════════════════════════
                        PYTHON AI/ML QUICK REFERENCE
    ═══════════════════════════════════════════════════════════════
    
    📊 DATA MANIPULATION
    • pandas: df.head(), df.info(), df.describe(), df.groupby()
    • numpy: np.array(), np.mean(), np.std(), np.reshape()
    
    🎨 VISUALIZATION
    • matplotlib: plt.plot(), plt.scatter(), plt.hist()
    • seaborn: sns.heatmap(), sns.boxplot(), sns.pairplot()
    
    🤖 MACHINE LEARNING
    • sklearn: train_test_split(), StandardScaler(), GridSearchCV()
    • Models: LinearRegression(), RandomForestClassifier(), SVC()
    
    🧠 DEEP LEARNING
    • TensorFlow/Keras: Sequential(), Dense(), Conv2D(), compile(), fit()
    • PyTorch: nn.Module, nn.Linear(), optim.Adam(), DataLoader()
    
    🔤 NLP
    • transformers: pipeline(), AutoTokenizer(), AutoModel()
    • nltk: word_tokenize(), stopwords, sentiment analysis
    
    📱 DEPLOYMENT
    • Streamlit: st.title(), st.selectbox(), st.button()
    • FastAPI: @app.post(), BaseModel, uvicorn
    
    ⚡ OPTIMIZATION
    • Memory: optimize dtypes, chunked processing
    • Parallel: joblib.Parallel(), multiprocessing
    
    🔒 BEST PRACTICES
    • Cross-validation, feature scaling, hyperparameter tuning
    • Model evaluation, data validation, version control
    • Documentation, testing, monitoring
    
    ═══════════════════════════════════════════════════════════════
    """
    
    print(cheatsheet)

# Print system info and cheatsheet when module is loaded
if __name__ == "__main__":
    print("🚀 Python AI/ML Reference Loaded Successfully!")
    print("\n📋 System Information:")
    system_info = get_system_info()
    for key, value in system_info.items():
        print(f"   {key}: {value}")
    
    print("\n" + "="*60)
    print("💡 Quick Start Tips:")
    print("1. Start with data exploration: df.head(), df.info(), df.describe()")
    print("2. Visualize your data: plt.hist(), sns.pairplot()")
    print("3. Preprocess: train_test_split(), StandardScaler()")
    print("4. Train models: fit(), predict(), score()")
    print("5. Evaluate: classification_report(), confusion_matrix()")
    print("6. Deploy: Streamlit or FastAPI")
    print("="*60)
    
    # Uncomment to print full cheatsheet
    # print_ml_cheatsheet()

"""
═══════════════════════════════════════════════════════════════════════════════
                            END OF REFERENCE
═══════════════════════════════════════════════════════════════════════════════

This comprehensive Python AI/ML reference covers:

✅ Essential Libraries: NumPy, Pandas, Matplotlib, Seaborn
✅ Machine Learning: Scikit-learn, model evaluation, cross-validation
✅ Deep Learning: TensorFlow/Keras, PyTorch, neural networks
✅ NLP: Transformers, NLTK, spaCy, text processing
✅ Computer Vision: OpenCV, image processing, object detection
✅ LLM Integration: OpenAI API, Anthropic Claude, LangChain
✅ Web Applications: Streamlit, Gradio, FastAPI
✅ Advanced Topics: Ensemble methods, time series, reinforcement learning
✅ Deployment: Docker, API creation, model serialization
✅ Best Practices: Data validation, performance optimization, security
✅ Production: Monitoring, logging, automated ML pipelines

For more advanced topics and latest updates, visit:
- TensorFlow: https://tensorflow.org
- PyTorch: https://pytorch.org
- Scikit-learn: https://scikit-learn.org
- Hugging Face: https://huggingface.co
- OpenAI: https://openai.com
- Anthropic: https://anthropic.com

Happy coding! 🎉
"""