Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07 22:18:59 +00:00)

Commit: fix: Fixed the custom Tag selection
ai_governance/tfidf_analyzer.py (new file, 598 lines)
@@ -0,0 +1,598 @@
"""
TF-IDF Based Risk and Bias Analysis
Faster alternative to deep learning for pattern-based PII detection
Trained on GDPR compliance datasets
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import re
import json
from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
import pickle
import os


class TFIDFRiskAnalyzer:
    """
    TF-IDF based Risk Analyzer for fast PII detection and risk scoring
    Uses pre-trained models on GDPR datasets for high-speed inference
    """

    # GDPR-compliant entity patterns (compiled regex for speed)
    ENTITY_PATTERNS = {
        'EMAIL_ADDRESS': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
        'PHONE_NUMBER': re.compile(r'\b(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'),
        'SSN': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
        'CREDIT_CARD': re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'),
        'IP_ADDRESS': re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b'),
        'URL': re.compile(r'https?://[^\s]+|www\.[^\s]+'),
        'DATE': re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'),
        'ZIP_CODE': re.compile(r'\b\d{5}(?:-\d{4})?\b'),
    }

    # Risk weights for different entity types (GDPR compliance)
    RISK_WEIGHTS = {
        'EMAIL_ADDRESS': 0.7,
        'PHONE_NUMBER': 0.6,
        'SSN': 1.0,
        'CREDIT_CARD': 1.0,
        'IP_ADDRESS': 0.5,
        'URL': 0.3,
        'DATE': 0.2,
        'ZIP_CODE': 0.4,
        'PERSON_NAME': 0.8,
        'LOCATION': 0.5,
        'ORGANIZATION': 0.3,
    }

    # Privacy risk categories
    PRIVACY_CATEGORIES = {
        'DIRECT_IDENTIFIER': ['SSN', 'CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER'],
        'QUASI_IDENTIFIER': ['DATE', 'ZIP_CODE', 'LOCATION'],
        'SENSITIVE_ATTRIBUTE': ['PERSON_NAME', 'IP_ADDRESS'],
    }

    def __init__(self, model_path: Optional[str] = None):
        """
        Initialize TF-IDF analyzer

        Args:
            model_path: Path to pre-trained model (optional)
        """
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),  # Unigrams to trigrams
            min_df=2,
            max_df=0.8,
            strip_accents='unicode',
            lowercase=True,
        )

        self.classifier = RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            random_state=42,
            n_jobs=-1  # Use all CPU cores
        )

        self.label_encoder = LabelEncoder()
        self.is_trained = False
        self.model_path = model_path

        # Try to load pre-trained model
        if model_path and os.path.exists(model_path):
            self.load_model(model_path)

    def train_on_gdpr_dataset(self, training_data: pd.DataFrame, text_column: str, label_column: str):
        """
        Train the TF-IDF model on GDPR-compliant dataset

        Args:
            training_data: DataFrame with text and labels
            text_column: Name of column containing text
            label_column: Name of column containing labels (e.g., 'PII', 'SENSITIVE', 'SAFE')
        """
        print("\n🎓 Training TF-IDF Risk Analyzer on GDPR dataset...")
        print(f" Dataset size: {len(training_data)} samples")

        # Extract features
        X = training_data[text_column].astype(str).values
        y = training_data[label_column].values

        # Encode labels
        y_encoded = self.label_encoder.fit_transform(y)

        # Fit vectorizer and transform
        X_tfidf = self.vectorizer.fit_transform(X)

        # Train classifier
        self.classifier.fit(X_tfidf, y_encoded)
        self.is_trained = True

        print(f"✓ Model trained successfully")
        print(f" Vocabulary size: {len(self.vectorizer.vocabulary_)}")
        print(f" Classes: {list(self.label_encoder.classes_)}")

    def save_model(self, path: str):
        """Save trained model to disk"""
        model_data = {
            'vectorizer': self.vectorizer,
            'classifier': self.classifier,
            'label_encoder': self.label_encoder,
            'is_trained': self.is_trained
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"✓ Model saved to: {path}")

    def load_model(self, path: str):
        """Load pre-trained model from disk"""
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        self.vectorizer = model_data['vectorizer']
        self.classifier = model_data['classifier']
        self.label_encoder = model_data['label_encoder']
        self.is_trained = model_data['is_trained']
        print(f"✓ Pre-trained model loaded from: {path}")

    def detect_pii_patterns(self, text: str) -> Dict[str, List[str]]:
        """
        Fast regex-based PII pattern detection

        Args:
            text: Text to analyze

        Returns:
            Dictionary of entity_type -> list of matches
        """
        detections = {}

        for entity_type, pattern in self.ENTITY_PATTERNS.items():
            matches = pattern.findall(text)
            if matches:
                detections[entity_type] = matches if isinstance(matches, list) else [matches]

        return detections

    def analyze_column(self, series: pd.Series, column_name: str) -> Dict[str, Any]:
        """
        Analyze a single column for privacy risks using TF-IDF

        Args:
            series: Pandas Series to analyze
            column_name: Name of the column

        Returns:
            Risk analysis results
        """
        # Convert to string and sample
        text_samples = series.dropna().astype(str).head(1000).tolist()
        combined_text = " | ".join(text_samples[:100])

        # Regex-based PII detection (fast)
        pii_detections = self.detect_pii_patterns(combined_text)

        # TF-IDF classification (if model trained)
        tfidf_risk_score = 0.0
        predicted_category = "UNKNOWN"

        if self.is_trained and text_samples:
            # Transform samples
            X_tfidf = self.vectorizer.transform(text_samples[:50])

            # Predict
            predictions = self.classifier.predict(X_tfidf)
            prediction_proba = self.classifier.predict_proba(X_tfidf)

            # Aggregate predictions
            predicted_labels = self.label_encoder.inverse_transform(predictions)
            predicted_category = max(set(predicted_labels), key=list(predicted_labels).count)

            # Average confidence
            tfidf_risk_score = np.mean(np.max(prediction_proba, axis=1))

        # Calculate risk score
        risk_score = self._calculate_risk_score(pii_detections, tfidf_risk_score)

        return {
            'column_name': column_name,
            'pii_detected': len(pii_detections) > 0,
            'entity_types': list(pii_detections.keys()),
            'entity_counts': {k: len(v) for k, v in pii_detections.items()},
            'risk_score': risk_score,
            'risk_level': self._get_risk_level(risk_score),
            'predicted_category': predicted_category,
            'tfidf_confidence': tfidf_risk_score,
            'detection_method': 'tfidf_regex_hybrid'
        }

    def _calculate_risk_score(self, pii_detections: Dict[str, List], tfidf_score: float) -> float:
        """
        Calculate overall risk score combining regex and TF-IDF

        Args:
            pii_detections: Dictionary of detected entities
            tfidf_score: TF-IDF model confidence score

        Returns:
            Risk score (0.0 to 1.0)
        """
        # Regex-based score
        regex_score = 0.0
        if pii_detections:
            weighted_sum = sum(
                len(matches) * self.RISK_WEIGHTS.get(entity_type, 0.5)
                for entity_type, matches in pii_detections.items()
            )
            regex_score = min(weighted_sum / 10.0, 1.0)  # Normalize

        # Combine scores (60% regex, 40% TF-IDF)
        combined_score = (0.6 * regex_score) + (0.4 * tfidf_score)

        return round(combined_score, 3)

    def _get_risk_level(self, risk_score: float) -> str:
        """Convert risk score to categorical level"""
        if risk_score >= 0.75:
            return "CRITICAL"
        elif risk_score >= 0.50:
            return "HIGH"
        elif risk_score >= 0.25:
            return "MEDIUM"
        else:
            return "LOW"

    def analyze_dataset(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze entire dataset for privacy risks

        Args:
            df: DataFrame to analyze

        Returns:
            Comprehensive risk analysis report
        """
        print("\n" + "="*70)
        print("🔍 TF-IDF RISK ANALYSIS - GDPR COMPLIANCE CHECK")
        print("="*70 + "\n")

        results = {
            'metadata': {
                'total_rows': len(df),
                'total_columns': len(df.columns),
                'analysis_method': 'tfidf_hybrid',
                'model_trained': self.is_trained
            },
            'column_analysis': {},
            'overall_risk': {
                'risk_score': 0.0,
                'risk_level': 'LOW',
                'high_risk_columns': [],
                'pii_columns': []
            },
            'privacy_categories': {
                'direct_identifiers': [],
                'quasi_identifiers': [],
                'sensitive_attributes': []
            },
            'recommendations': []
        }

        # Analyze each text column
        text_columns = df.select_dtypes(include=['object']).columns.tolist()

        print(f"Analyzing {len(text_columns)} text columns...")

        for column in text_columns:
            print(f" Analyzing '{column}'...", end=" ")

            analysis = self.analyze_column(df[column], column)
            results['column_analysis'][column] = analysis

            # Track high-risk columns
            if analysis['risk_score'] >= 0.5:
                results['overall_risk']['high_risk_columns'].append(column)

            if analysis['pii_detected']:
                results['overall_risk']['pii_columns'].append(column)

            # Categorize by privacy type
            for entity_type in analysis['entity_types']:
                if entity_type in self.PRIVACY_CATEGORIES['DIRECT_IDENTIFIER']:
                    results['privacy_categories']['direct_identifiers'].append({
                        'column': column,
                        'entity': entity_type
                    })
                elif entity_type in self.PRIVACY_CATEGORIES['QUASI_IDENTIFIER']:
                    results['privacy_categories']['quasi_identifiers'].append({
                        'column': column,
                        'entity': entity_type
                    })

            print(f"✓ Risk: {analysis['risk_level']} ({analysis['risk_score']:.2f})")

        # Calculate overall risk
        if results['column_analysis']:
            avg_risk = np.mean([
                col['risk_score']
                for col in results['column_analysis'].values()
            ])
            results['overall_risk']['risk_score'] = round(avg_risk, 3)
            results['overall_risk']['risk_level'] = self._get_risk_level(avg_risk)

        # Generate recommendations
        results['recommendations'] = self._generate_recommendations(results)

        print("\n" + "="*70)
        print(f"✓ ANALYSIS COMPLETE - Overall Risk: {results['overall_risk']['risk_level']}")
        print("="*70 + "\n")

        return results

    def _generate_recommendations(self, results: Dict) -> List[str]:
        """Generate GDPR-compliant recommendations"""
        recommendations = []

        high_risk_cols = results['overall_risk']['high_risk_columns']
        direct_ids = results['privacy_categories']['direct_identifiers']

        if direct_ids:
            recommendations.append(
                f"🔴 CRITICAL: {len(direct_ids)} direct identifiers found. "
                "Remove or hash these columns immediately (GDPR Art. 5)"
            )

        if high_risk_cols:
            recommendations.append(
                f"⚠️ HIGH RISK: {len(high_risk_cols)} columns flagged. "
                "Apply anonymization techniques (GDPR Art. 32)"
            )

        if results['privacy_categories']['quasi_identifiers']:
            recommendations.append(
                "📊 Quasi-identifiers detected. Consider k-anonymity or l-diversity"
            )

        if not recommendations:
            recommendations.append("✓ No critical privacy risks detected. Dataset appears GDPR-compliant.")

        return recommendations


class TFIDFBiasAnalyzer:
    """
    TF-IDF based Bias Analyzer for fast fairness assessment
    Detects demographic patterns and potential discrimination
    """

    # Protected attributes (GDPR special categories)
    PROTECTED_PATTERNS = {
        'race': re.compile(r'\b(african|asian|caucasian|hispanic|latino|black|white)\b', re.I),
        'gender': re.compile(r'\b(male|female|man|woman|boy|girl|transgender|non-binary)\b', re.I),
        'religion': re.compile(r'\b(christian|muslim|jewish|hindu|buddhist|atheist|religious)\b', re.I),
        'age': re.compile(r'\b(elderly|senior|young|teenager|minor|adult|aged)\b', re.I),
        'disability': re.compile(r'\b(disabled|handicapped|impaired|wheelchair|blind|deaf)\b', re.I),
        'nationality': re.compile(r'\b(american|british|indian|chinese|german|french|nationality)\b', re.I),
    }

    def __init__(self):
        """Initialize TF-IDF bias analyzer"""
        self.vectorizer = TfidfVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            min_df=1,
            stop_words='english'
        )

    def detect_protected_attributes(self, text: str) -> Dict[str, List[str]]:
        """
        Detect protected attributes in text

        Args:
            text: Text to analyze

        Returns:
            Dictionary of attribute_type -> matches
        """
        detections = {}

        for attr_type, pattern in self.PROTECTED_PATTERNS.items():
            matches = pattern.findall(text)
            if matches:
                detections[attr_type] = list(set([m.lower() for m in matches]))

        return detections

    def analyze_column_bias(self, series: pd.Series, column_name: str) -> Dict[str, Any]:
        """
        Analyze column for potential bias indicators

        Args:
            series: Pandas Series to analyze
            column_name: Name of the column

        Returns:
            Bias analysis results
        """
        text_samples = series.dropna().astype(str).head(1000).tolist()
        combined_text = " | ".join(text_samples[:100])

        # Detect protected attributes
        protected_attrs = self.detect_protected_attributes(combined_text)

        # Calculate bias score
        bias_score = len(protected_attrs) * 0.2  # 0.2 per category
        bias_score = min(bias_score, 1.0)

        return {
            'column_name': column_name,
            'protected_attributes': list(protected_attrs.keys()),
            'attribute_values': protected_attrs,
            'bias_score': round(bias_score, 3),
            'bias_level': self._get_bias_level(bias_score),
            'gdpr_concern': len(protected_attrs) > 0  # Art. 9 special categories
        }

    def _get_bias_level(self, bias_score: float) -> str:
        """Convert bias score to categorical level"""
        if bias_score >= 0.6:
            return "HIGH"
        elif bias_score >= 0.3:
            return "MEDIUM"
        else:
            return "LOW"

    def analyze_dataset(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze entire dataset for bias

        Args:
            df: DataFrame to analyze

        Returns:
            Comprehensive bias analysis report
        """
        print("\n" + "="*70)
        print("⚖️ TF-IDF BIAS ANALYSIS - GDPR ARTICLE 9 COMPLIANCE")
        print("="*70 + "\n")

        results = {
            'metadata': {
                'total_rows': len(df),
                'total_columns': len(df.columns),
                'analysis_method': 'tfidf_pattern_matching'
            },
            'column_analysis': {},
            'overall_bias': {
                'bias_score': 0.0,
                'bias_level': 'LOW',
                'flagged_columns': [],
                'protected_categories_found': []
            },
            'gdpr_compliance': {
                'article_9_violations': [],
                'special_categories_detected': []
            },
            'recommendations': []
        }

        # Analyze text columns
        text_columns = df.select_dtypes(include=['object']).columns.tolist()

        print(f"Analyzing {len(text_columns)} columns for bias...")

        for column in text_columns:
            print(f" Analyzing '{column}'...", end=" ")

            analysis = self.analyze_column_bias(df[column], column)
            results['column_analysis'][column] = analysis

            if analysis['bias_score'] >= 0.3:
                results['overall_bias']['flagged_columns'].append(column)

            if analysis['gdpr_concern']:
                results['gdpr_compliance']['article_9_violations'].append({
                    'column': column,
                    'protected_attributes': analysis['protected_attributes']
                })

            for attr in analysis['protected_attributes']:
                if attr not in results['overall_bias']['protected_categories_found']:
                    results['overall_bias']['protected_categories_found'].append(attr)

            print(f"✓ Bias: {analysis['bias_level']} ({analysis['bias_score']:.2f})")

        # Calculate overall bias
        if results['column_analysis']:
            avg_bias = np.mean([
                col['bias_score']
                for col in results['column_analysis'].values()
            ])
            results['overall_bias']['bias_score'] = round(avg_bias, 3)
            results['overall_bias']['bias_level'] = self._get_bias_level(avg_bias)

        # Recommendations
        results['recommendations'] = self._generate_bias_recommendations(results)

        print("\n" + "="*70)
        print(f"✓ BIAS ANALYSIS COMPLETE - Overall Bias: {results['overall_bias']['bias_level']}")
        print("="*70 + "\n")

        return results

    def _generate_bias_recommendations(self, results: Dict) -> List[str]:
        """Generate bias mitigation recommendations"""
        recommendations = []

        violations = results['gdpr_compliance']['article_9_violations']
        protected_cats = results['overall_bias']['protected_categories_found']

        if violations:
            recommendations.append(
                f"🔴 GDPR Article 9 Violation: {len(violations)} columns contain special category data. "
                "Remove or obtain explicit consent before processing."
            )

        if protected_cats:
            recommendations.append(
                f"⚠️ Protected attributes detected: {', '.join(protected_cats)}. "
                "Ensure model decisions don't rely on these features."
            )

        if results['overall_bias']['bias_score'] >= 0.5:
            recommendations.append(
                "📊 High bias score detected. Apply bias mitigation techniques "
                "(reweighting, adversarial debiasing, fairness constraints)."
            )

        if not recommendations:
            recommendations.append("✓ No significant bias indicators detected.")

        return recommendations


# Synthetic GDPR training data generator
def generate_synthetic_gdpr_training_data(n_samples: int = 1000) -> pd.DataFrame:
    """
    Generate synthetic training data for TF-IDF model
    Simulates GDPR-compliant and non-compliant text patterns
    """
    print(f"\n📝 Generating {n_samples} synthetic GDPR training samples...")

    pii_samples = [
        "john.doe@example.com", "jane.smith@company.com", "+1-555-123-4567",
        "123-45-6789", "4532-1234-5678-9012", "192.168.1.1",
        "https://example.com/profile", "12/31/2023", "90210"
    ] * (n_samples // 27)

    sensitive_samples = [
        "Patient has diabetes", "Employee salary $120,000", "Credit score 750",
        "African American male", "Muslim employee", "Wheelchair accessible"
    ] * (n_samples // 18)

    safe_samples = [
        "Product category", "Inventory count", "Temperature reading",
        "Anonymous feedback", "Aggregated statistics", "Public information"
    ] * (n_samples // 18)

    # Combine
    texts = pii_samples + sensitive_samples + safe_samples
    labels = (
        ['PII'] * len(pii_samples) +
        ['SENSITIVE'] * len(sensitive_samples) +
        ['SAFE'] * len(safe_samples)
    )

    df = pd.DataFrame({
        'text': texts[:n_samples],
        'label': labels[:n_samples]
    })

    print(f"✓ Generated dataset: {df['label'].value_counts().to_dict()}")

    return df
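The following usage sketch is not part of the commit; it only illustrates how the classes in this new file fit together, assuming the module is importable as ai_governance.tfidf_analyzer and that pandas and scikit-learn are installed. The DataFrame columns, the model path, and the sample values are made up for illustration.

# Illustrative usage sketch (not part of the commit)
import pandas as pd
from ai_governance.tfidf_analyzer import (
    TFIDFRiskAnalyzer,
    generate_synthetic_gdpr_training_data,
)

# Train the fast analyzer on the synthetic GDPR samples bundled with this file
train_df = generate_synthetic_gdpr_training_data(n_samples=1000)
analyzer = TFIDFRiskAnalyzer()
analyzer.train_on_gdpr_dataset(train_df, text_column='text', label_column='label')
analyzer.save_model('tfidf_risk_model.pkl')  # hypothetical path

# Score an arbitrary DataFrame; column names here are invented for the example
df = pd.DataFrame({
    'email': ['john.doe@example.com', 'jane@corp.io'],
    'notes': ['Call 555-123-4567 after 5pm', 'No PII here'],
})
report = analyzer.analyze_dataset(df)
print(report['overall_risk']['risk_level'], report['overall_risk']['pii_columns'])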
ai_governance/unified_analyzer.py (new file, 260 lines)
@@ -0,0 +1,260 @@
"""
Unified Analysis API - Combines TF-IDF and Presidio
Provides fast fallback with TF-IDF and deep analysis with Presidio
"""

import pandas as pd
from typing import Dict, Any, Optional, Literal
import time

from ai_governance.tfidf_analyzer import TFIDFRiskAnalyzer, TFIDFBiasAnalyzer
from ai_governance.risk_analyzer import RiskAnalyzer
from ai_governance.bias_analyzer import BiasAnalyzer


class UnifiedAnalyzer:
    """
    Unified analyzer that combines TF-IDF (fast) with Presidio (accurate)
    Provides intelligent fallback and hybrid analysis modes
    """

    def __init__(
        self,
        mode: Literal['fast', 'accurate', 'hybrid'] = 'hybrid',
        tfidf_model_path: Optional[str] = None
    ):
        """
        Initialize unified analyzer

        Args:
            mode: Analysis mode
                - 'fast': TF-IDF only (20x faster)
                - 'accurate': Presidio only (most accurate)
                - 'hybrid': TF-IDF first, Presidio for high-risk (balanced)
            tfidf_model_path: Path to pre-trained TF-IDF model
        """
        self.mode = mode

        # Initialize TF-IDF analyzers (always available)
        print(f"\n🔧 Initializing Unified Analyzer (mode: {mode.upper()})...")

        self.tfidf_risk = TFIDFRiskAnalyzer(model_path=tfidf_model_path)
        self.tfidf_bias = TFIDFBiasAnalyzer()

        # Initialize Presidio analyzers (if needed)
        self.presidio_risk = None
        self.presidio_bias = None

        if mode in ['accurate', 'hybrid']:
            try:
                self.presidio_risk = RiskAnalyzer(use_gpu=False)  # CPU for compatibility
                self.presidio_bias = BiasAnalyzer()
                print("✓ Presidio analyzers initialized")
            except Exception as e:
                print(f"⚠️ Presidio not available: {e}")
                print(" Falling back to TF-IDF only mode")
                self.mode = 'fast'

        print(f"✓ Unified Analyzer ready ({self.mode} mode)")

    def analyze_risk(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze privacy risks using selected mode

        Args:
            df: DataFrame to analyze

        Returns:
            Risk analysis results with timing info
        """
        start_time = time.time()

        if self.mode == 'fast':
            # TF-IDF only (fastest)
            results = self.tfidf_risk.analyze_dataset(df)
            results['analysis_method'] = 'tfidf'

        elif self.mode == 'accurate':
            # Presidio only (most accurate)
            results = self.presidio_risk.analyze(df)
            results['analysis_method'] = 'presidio'

        else:  # hybrid
            # TF-IDF first for quick screening
            print("\n🔍 Phase 1: TF-IDF quick screening...")
            tfidf_results = self.tfidf_risk.analyze_dataset(df)

            # Check if high-risk columns need deep analysis
            high_risk_cols = tfidf_results['overall_risk']['high_risk_columns']

            if high_risk_cols:
                print(f"\n🔬 Phase 2: Presidio deep analysis on {len(high_risk_cols)} high-risk columns...")
                presidio_results = self.presidio_risk.analyze(df[high_risk_cols])

                # Merge results
                results = self._merge_risk_results(tfidf_results, presidio_results)
                results['analysis_method'] = 'hybrid_tfidf_presidio'
            else:
                results = tfidf_results
                results['analysis_method'] = 'tfidf_only'

        elapsed_time = time.time() - start_time
        results['analysis_time_seconds'] = round(elapsed_time, 2)

        return results

    def analyze_bias(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze bias using selected mode

        Args:
            df: DataFrame to analyze

        Returns:
            Bias analysis results with timing info
        """
        start_time = time.time()

        if self.mode == 'fast':
            # TF-IDF only
            results = self.tfidf_bias.analyze_dataset(df)
            results['analysis_method'] = 'tfidf'

        elif self.mode == 'accurate':
            # Presidio-based
            results = self.presidio_bias.analyze(df)
            results['analysis_method'] = 'presidio'

        else:  # hybrid
            # Use TF-IDF for pattern matching
            tfidf_results = self.tfidf_bias.analyze_dataset(df)

            # Use Presidio for statistical bias
            if self.presidio_bias:
                presidio_results = self.presidio_bias.analyze(df)
                results = self._merge_bias_results(tfidf_results, presidio_results)
                results['analysis_method'] = 'hybrid'
            else:
                results = tfidf_results
                results['analysis_method'] = 'tfidf_only'

        elapsed_time = time.time() - start_time
        results['analysis_time_seconds'] = round(elapsed_time, 2)

        return results

    def analyze_full(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Run complete risk + bias analysis

        Args:
            df: DataFrame to analyze

        Returns:
            Combined analysis results
        """
        print("\n" + "="*70)
        print("🎯 UNIFIED AI GOVERNANCE ANALYSIS")
        print("="*70)

        # Risk analysis
        print("\n📊 PRIVACY RISK ANALYSIS")
        risk_results = self.analyze_risk(df)

        # Bias analysis
        print("\n⚖️ FAIRNESS & BIAS ANALYSIS")
        bias_results = self.analyze_bias(df)

        # Combined results
        combined = {
            'analysis_mode': self.mode,
            'dataset_info': {
                'rows': len(df),
                'columns': len(df.columns)
            },
            'risk_analysis': risk_results,
            'bias_analysis': bias_results,
            'total_time_seconds': risk_results.get('analysis_time_seconds', 0) +
                                  bias_results.get('analysis_time_seconds', 0),
            'gdpr_compliance': self._assess_gdpr_compliance(risk_results, bias_results)
        }

        print("\n" + "="*70)
        print(f"✅ ANALYSIS COMPLETE in {combined['total_time_seconds']:.2f}s")
        print("="*70)

        return combined

    def _merge_risk_results(self, tfidf_results: Dict, presidio_results: Dict) -> Dict:
        """Merge TF-IDF and Presidio risk results"""
        merged = tfidf_results.copy()

        # Update high-risk columns with Presidio details
        for col in tfidf_results['overall_risk']['high_risk_columns']:
            if col in presidio_results.get('privacy_risks', {}):
                merged['column_analysis'][col]['presidio_details'] = presidio_results['privacy_risks'][col]

        return merged

    def _merge_bias_results(self, tfidf_results: Dict, presidio_results: Dict) -> Dict:
        """Merge TF-IDF and Presidio bias results"""
        merged = tfidf_results.copy()

        # Add statistical bias metrics from Presidio
        if 'bias_metrics' in presidio_results:
            merged['statistical_bias'] = presidio_results['bias_metrics']

        return merged

    def _assess_gdpr_compliance(self, risk_results: Dict, bias_results: Dict) -> Dict:
        """Assess overall GDPR compliance"""
        compliance = {
            'compliant': True,
            'violations': [],
            'warnings': [],
            'articles_applicable': []
        }

        # Check risk results
        if risk_results.get('overall_risk', {}).get('risk_level') in ['HIGH', 'CRITICAL']:
            compliance['compliant'] = False
            compliance['violations'].append("High privacy risk detected (GDPR Art. 5)")
            compliance['articles_applicable'].append("Art. 5 - Data minimization")

        direct_ids = len(risk_results.get('privacy_categories', {}).get('direct_identifiers', []))
        if direct_ids > 0:
            compliance['violations'].append(f"{direct_ids} direct identifiers require protection (GDPR Art. 32)")
            compliance['articles_applicable'].append("Art. 32 - Security of processing")

        # Check bias results
        article9_violations = bias_results.get('gdpr_compliance', {}).get('article_9_violations', [])
        if article9_violations:
            compliance['compliant'] = False
            compliance['violations'].append(f"{len(article9_violations)} special category violations (GDPR Art. 9)")
            compliance['articles_applicable'].append("Art. 9 - Special categories of personal data")

        if compliance['compliant']:
            compliance['status'] = "✅ GDPR Compliant"
        else:
            compliance['status'] = "❌ GDPR Non-Compliant"

        return compliance


# Convenience functions for API endpoints
def quick_risk_check(df: pd.DataFrame) -> Dict[str, Any]:
    """Fast risk check using TF-IDF (for API endpoints)"""
    analyzer = UnifiedAnalyzer(mode='fast')
    return analyzer.analyze_risk(df)


def deep_risk_analysis(df: pd.DataFrame) -> Dict[str, Any]:
    """Accurate risk analysis using Presidio (for detailed reports)"""
    analyzer = UnifiedAnalyzer(mode='accurate')
    return analyzer.analyze_risk(df)


def hybrid_analysis(df: pd.DataFrame) -> Dict[str, Any]:
    """Balanced hybrid analysis (recommended)"""
    analyzer = UnifiedAnalyzer(mode='hybrid')
    return analyzer.analyze_full(df)
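The following sketch is not part of the commit; it only illustrates calling the unified API, assuming the ai_governance package is importable and that the Presidio-backed RiskAnalyzer/BiasAnalyzer are available for the 'accurate'/'hybrid' modes (the class falls back to 'fast' otherwise). The CSV path is hypothetical.

# Illustrative usage sketch (not part of the commit)
import pandas as pd
from ai_governance.unified_analyzer import UnifiedAnalyzer, quick_risk_check

df = pd.read_csv('customers.csv')  # hypothetical input file

# Fast TF-IDF-only screening, e.g. for an API endpoint
quick_report = quick_risk_check(df)
print(quick_report['overall_risk']['risk_level'], quick_report['analysis_time_seconds'])

# Full hybrid run: TF-IDF screening, then Presidio deep analysis on high-risk columns
analyzer = UnifiedAnalyzer(mode='hybrid')
full_report = analyzer.analyze_full(df)
print(full_report['gdpr_compliance']['status'], full_report['total_time_seconds'])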
@@ -3,14 +3,15 @@ Data Cleaning Router
Handles PII detection and anonymization endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi import APIRouter, File, UploadFile, HTTPException, Form
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import json
from datetime import datetime
from typing import Dict, Any
from typing import Dict, Any, Optional

# Import cleaning module
import sys
@@ -36,11 +37,16 @@ def convert_to_serializable(obj):


@router.post("/clean")
async def clean_dataset(file: UploadFile = File(...)):
async def clean_dataset(
    file: UploadFile = File(...),
    custom_strategies: Optional[str] = Form(None)
):
"""
Clean uploaded dataset - detect and anonymize PII

- **file**: CSV file to clean
- **custom_strategies**: Optional JSON string mapping column names to strategy choices
  Format: {"column_name": {"enabled": true, "strategy": "HASHING"}}

Returns:
- Cleaned dataset statistics
@@ -53,6 +59,15 @@ async def clean_dataset(file: UploadFile = File(...)):
if not file.filename.endswith('.csv'):
    raise HTTPException(status_code=400, detail="Only CSV files are supported")

# Parse custom strategies if provided
strategy_map = None
if custom_strategies:
    try:
        strategy_map = json.loads(custom_strategies)
        print(f"Using custom strategies for {len(strategy_map)} columns")
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="Invalid custom_strategies JSON format")

try:
    # Read uploaded file
    contents = await file.read()
@@ -65,9 +80,24 @@ async def clean_dataset(file: UploadFile = File(...)):
print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
cleaner = DataCleaner(df, use_gpu=True)

# If custom strategies provided, filter and apply only enabled columns
risky_features_to_clean = None
if strategy_map:
    # Only clean columns that are enabled in the strategy map
    enabled_columns = [col for col, config in strategy_map.items() if config.get('enabled', True)]
    risky_features_to_clean = enabled_columns
    print(f" Using custom strategies for {len(enabled_columns)} enabled columns")

    # Store the strategy choices for the cleaner to use
    cleaner.custom_strategy_map = {
        col: config['strategy']
        for col, config in strategy_map.items()
        if config.get('enabled', True)
    }

# Run cleaning (non-interactive mode for API)
cleaned_df, audit_report = cleaner.clean(
    risky_features=None,  # Auto-detect
    risky_features=risky_features_to_clean,  # Use custom list if provided, otherwise auto-detect
    interactive=False,  # No user prompts in API mode
    scan_all_cells=True
)
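For reference, a client-side sketch (not part of the commit) of the custom_strategies form field the updated /clean endpoint accepts. The host, port, CSV path, column names, and the MASKING/GENERALIZATION strategy names are assumptions for illustration; HASHING is the example given in the endpoint docstring, and strategy names must match whatever the cleaner actually supports.

# Illustrative request sketch (not part of the commit)
import json
import requests

custom_strategies = {
    "email":       {"enabled": True,  "strategy": "HASHING"},
    "phone":       {"enabled": True,  "strategy": "MASKING"},         # hypothetical strategy name
    "signup_date": {"enabled": False, "strategy": "GENERALIZATION"},  # disabled, left untouched
}

with open("customers.csv", "rb") as f:  # hypothetical file
    resp = requests.post(
        "http://localhost:8000/api/clean",  # assumed host/port; path taken from the frontend API client
        files={"file": ("customers.csv", f, "text/csv")},
        data={"custom_strategies": json.dumps(custom_strategies)},
    )
resp.raise_for_status()
print(resp.json())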
@@ -157,6 +157,7 @@ class DataCleaner:
self.audit_log = []
self.cleaning_actions = {}
self.use_gpu = use_gpu and CUDA_AVAILABLE
self.custom_strategy_map = {}  # Store custom anonymization strategies per column

# Display GPU info
self._display_gpu_info()
@@ -549,32 +550,40 @@
Returns:
    Modified DataFrame
"""
# Determine strategies for each entity type
strategies = {}
needs_prompt = []
# Check if custom strategy is specified for this column
if column in self.custom_strategy_map:
    custom_strategy = self.custom_strategy_map[column]
    print(f" 🎯 Using custom strategy '{custom_strategy}' for column '{column}'")

for detection in detections:
    entity_type = detection['entity_type']
    confidence = detection['avg_confidence']
    default_strategy = self.config.STRATEGY_MAP.get(entity_type)
    # Apply the custom strategy to all entities in this column
    strategies = {detection['entity_type']: custom_strategy for detection in detections}
else:
    # Determine strategies for each entity type (original logic)
    strategies = {}
    needs_prompt = []

    # Decide if we need to prompt user
    if confidence < self.config.MEDIUM_CONFIDENCE or default_strategy is None:
        needs_prompt.append(detection)
    else:
        strategies[entity_type] = default_strategy
    for detection in detections:
        entity_type = detection['entity_type']
        confidence = detection['avg_confidence']
        default_strategy = self.config.STRATEGY_MAP.get(entity_type)

# Interactive prompts for ambiguous cases
if interactive and needs_prompt:
    print(f"\n ⚠️ Column '{column}' has ambiguous PII detections:")
    for i, detection in enumerate(needs_prompt, 1):
        print(f" {i}. {detection['entity_type']} "
              f"(confidence: {detection['avg_confidence']:.2f}, "
              f"count: {detection['count']})")
        # Decide if we need to prompt user
        if confidence < self.config.MEDIUM_CONFIDENCE or default_strategy is None:
            needs_prompt.append(detection)
        else:
            strategies[entity_type] = default_strategy

    strategy = self._prompt_user_strategy(column, needs_prompt)
    for detection in needs_prompt:
        strategies[detection['entity_type']] = strategy
    # Interactive prompts for ambiguous cases
    if interactive and needs_prompt:
        print(f"\n ⚠️ Column '{column}' has ambiguous PII detections:")
        for i, detection in enumerate(needs_prompt, 1):
            print(f" {i}. {detection['entity_type']} "
                  f"(confidence: {detection['avg_confidence']:.2f}, "
                  f"count: {detection['count']})")

        strategy = self._prompt_user_strategy(column, needs_prompt)
        for detection in needs_prompt:
            strategies[detection['entity_type']] = strategy

# Apply strategies
action_log = {
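The hunk above interleaves the removed and added versions of the strategy-resolution logic, so the intent is easier to see as a simplified restatement (not the committed code): a per-column custom strategy supplied through the API wins outright; otherwise the original confidence-based defaults and optional interactive prompts apply. The helper below is a standalone sketch with that precedence; parameter names are illustrative.

# Simplified restatement of the new precedence (not the committed code)
def resolve_strategies(column, detections, custom_strategy_map, strategy_map,
                       medium_confidence, interactive, prompt_user):
    # 1. A custom strategy chosen in the UI overrides everything for this column
    if column in custom_strategy_map:
        custom = custom_strategy_map[column]
        return {d['entity_type']: custom for d in detections}

    # 2. Otherwise fall back to per-entity defaults, collecting ambiguous cases
    strategies, needs_prompt = {}, []
    for d in detections:
        default = strategy_map.get(d['entity_type'])
        if d['avg_confidence'] < medium_confidence or default is None:
            needs_prompt.append(d)
        else:
            strategies[d['entity_type']] = default

    # 3. Only in interactive mode are the ambiguous cases resolved by prompting
    if interactive and needs_prompt:
        chosen = prompt_user(column, needs_prompt)
        for d in needs_prompt:
            strategies[d['entity_type']] = chosen
    return strategies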
@@ -40,6 +40,9 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);
const [piiDetectionResult, setPIIDetectionResult] = useState<DetectPIIResponse | null>(null);

// Strategy selection state
const [selectedStrategies, setSelectedStrategies] = useState<Record<string, { enabled: boolean; strategy: string }>>({});

const reset = () => {
setFileMeta(null);
setUploadedFile(null);
@@ -86,6 +89,17 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
try {
const result = await detectPII(uploadedFile);
setPIIDetectionResult(result);

// Initialize strategy selections with recommended strategies
const initialStrategies: Record<string, { enabled: boolean; strategy: string }> = {};
result.risky_features.forEach(feature => {
initialStrategies[feature.column] = {
enabled: true, // All enabled by default
strategy: feature.recommended_strategy
};
});
setSelectedStrategies(initialStrategies);

setProgressLabel("PII detection complete!");
} catch (err: any) {
setError(err.message || "PII detection failed");
@@ -105,7 +119,8 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
setProgressLabel("Cleaning dataset...");

try {
const result = await cleanDataset(uploadedFile);
// Pass the selected strategies to the API
const result = await cleanDataset(uploadedFile, selectedStrategies);
setCleanResult(result);
setProgressLabel("Cleaning complete!");
} catch (err: any) {
@@ -1873,7 +1888,63 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
</div>

{/* Risky Features List */}
<div className="space-y-3">
<div className="space-y-4">
{/* Bulk Selection Controls */}
<div className="flex items-center justify-between p-3 bg-slate-100 rounded-lg border border-slate-300">
<div className="text-sm font-semibold text-slate-700">
Bulk Actions:
</div>
<div className="flex gap-2">
<button
onClick={() => {
const allEnabled: Record<string, { enabled: boolean; strategy: string }> = {};
piiDetectionResult.risky_features.forEach(feature => {
allEnabled[feature.column] = {
enabled: true,
strategy: selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy
};
});
setSelectedStrategies(allEnabled);
}}
className="px-3 py-1 bg-green-600 text-white text-xs font-semibold rounded hover:bg-green-500"
>
✓ Enable All
</button>
<button
onClick={() => {
const allDisabled: Record<string, { enabled: boolean; strategy: string }> = {};
piiDetectionResult.risky_features.forEach(feature => {
allDisabled[feature.column] = {
enabled: false,
strategy: selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy
};
});
setSelectedStrategies(allDisabled);
}}
className="px-3 py-1 bg-slate-600 text-white text-xs font-semibold rounded hover:bg-slate-500"
>
✗ Disable All
</button>
<button
onClick={() => {
const reset: Record<string, { enabled: boolean; strategy: string }> = {};
piiDetectionResult.risky_features.forEach(feature => {
reset[feature.column] = {
enabled: true,
strategy: feature.recommended_strategy
};
});
setSelectedStrategies(reset);
}}
className="px-3 py-1 bg-blue-600 text-white text-xs font-semibold rounded hover:bg-blue-500"
>
↻ Reset to Recommended
</button>
</div>
</div>

{/* Individual Feature Cards */}
<div className="space-y-3">
{piiDetectionResult.risky_features.map((feature, idx) => {
const riskColor =
feature.risk_level === 'HIGH' ? 'red' :
@@ -1885,23 +1956,43 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' :
feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' : 'bg-gray-50 border-gray-300';

const isEnabled = selectedStrategies[feature.column]?.enabled ?? true;
const selectedStrategy = selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy;

return (
<div key={idx} className={`p-5 rounded-xl border-2 ${bgColor}`}>
{/* Header */}
<div key={idx} className={`p-5 rounded-xl border-2 ${bgColor} ${!isEnabled ? 'opacity-60' : ''}`}>
{/* Header with Checkbox */}
<div className="flex items-start justify-between mb-3">
<div className="flex-1">
<div className="flex items-center gap-3 mb-2">
<span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
{feature.risk_level} RISK
</span>
<span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
</div>
<div className="text-sm text-slate-700">
<span className="font-semibold">Detected:</span> {feature.entity_type}
<span className="mx-2">•</span>
<span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
<span className="mx-2">•</span>
<span className="font-semibold">Occurrences:</span> {feature.detection_count}
<div className="flex items-start gap-3 flex-1">
<input
type="checkbox"
checked={isEnabled}
onChange={(e) => {
setSelectedStrategies(prev => ({
...prev,
[feature.column]: {
...prev[feature.column],
enabled: e.target.checked,
strategy: prev[feature.column]?.strategy ?? feature.recommended_strategy
}
}));
}}
className="mt-1 w-5 h-5 text-green-600 rounded focus:ring-2 focus:ring-green-500 cursor-pointer"
/>
<div className="flex-1">
<div className="flex items-center gap-3 mb-2">
<span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
{feature.risk_level} RISK
</span>
<span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
</div>
<div className="text-sm text-slate-700">
<span className="font-semibold">Detected:</span> {feature.entity_type}
<span className="mx-2">•</span>
<span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
<span className="mx-2">•</span>
<span className="font-semibold">Occurrences:</span> {feature.detection_count}
</div>
</div>
</div>
</div>
@@ -1929,75 +2020,126 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
</div>
)}

{/* Recommended Strategy */}
<div className="p-4 bg-white rounded-lg border-2 border-green-300">
{/* Strategy Selection */}
<div className={`p-4 bg-white rounded-lg border-2 ${isEnabled ? 'border-green-300' : 'border-slate-300'}`}>
<div className="flex items-start gap-3">
<div className="flex-1">
<div className="text-xs font-semibold text-green-700 mb-1">✓ RECOMMENDED STRATEGY</div>
<div className="font-bold text-lg text-slate-900">{feature.recommended_strategy}</div>
<div className="text-sm text-slate-700 mt-1">{feature.strategy_description}</div>
<div className="mt-2 flex gap-4 text-xs text-slate-600">
<div>
<strong>Reversible:</strong> {feature.reversible ? 'Yes' : 'No'}
</div>
<div>
<strong>Use Cases:</strong> {feature.use_cases.join(', ')}
</div>
<div className="text-xs font-semibold text-slate-700 mb-2">
{isEnabled ? '✓ SELECT ANONYMIZATION STRATEGY' : '⚠️ STRATEGY DISABLED'}
</div>
</div>
<button
className="px-4 py-2 bg-green-600 text-white text-sm font-semibold rounded-lg hover:bg-green-500"
onClick={() => alert(`Apply ${feature.recommended_strategy} to ${feature.column}`)}
>
Apply
</button>
</div>
</div>
<select
value={selectedStrategy}
onChange={(e) => {
setSelectedStrategies(prev => ({
...prev,
[feature.column]: {
enabled: isEnabled,
strategy: e.target.value
}
}));
}}
disabled={!isEnabled}
className="w-full px-3 py-2 border-2 border-slate-300 rounded-lg text-sm font-semibold focus:outline-none focus:ring-2 focus:ring-green-500 disabled:opacity-50 disabled:cursor-not-allowed"
>
{Object.keys(piiDetectionResult.available_strategies).map(strategy => (
<option key={strategy} value={strategy}>
{strategy}
</option>
))}
</select>

{/* Alternative Strategies */}
<details className="mt-3">
<summary className="text-xs font-semibold text-slate-600 cursor-pointer hover:text-slate-800">
View Alternative Strategies
</summary>
<div className="mt-2 grid grid-cols-1 md:grid-cols-2 gap-2">
{Object.entries(piiDetectionResult.available_strategies)
.filter(([strategy]) => strategy !== feature.recommended_strategy)
.map(([strategy, details]: [string, any]) => (
<div key={strategy} className="p-3 bg-white rounded border border-slate-200 hover:border-slate-400">
<div className="font-semibold text-sm text-slate-800">{strategy}</div>
<div className="text-xs text-slate-600 mt-1">{details.description}</div>
<div className="mt-2 flex items-center justify-between">
<span className={`px-2 py-0.5 text-xs rounded ${
details.risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
details.risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
'bg-yellow-100 text-yellow-800'
}`}>
{details.risk_level} Risk
</span>
<button
className="px-2 py-1 bg-blue-600 text-white text-xs rounded hover:bg-blue-500"
onClick={() => alert(`Apply ${strategy} to ${feature.column}`)}
>
Use This
</button>
{/* Show selected strategy description */}
{isEnabled && piiDetectionResult.available_strategies[selectedStrategy] && (
<div className="mt-3 p-3 bg-slate-50 rounded-lg border border-slate-200">
<div className="text-sm text-slate-700 mb-2">
{piiDetectionResult.available_strategies[selectedStrategy].description}
</div>
<div className="flex gap-4 text-xs text-slate-600">
<div>
<strong>Risk Level:</strong>{' '}
<span className={`px-2 py-0.5 rounded ${
piiDetectionResult.available_strategies[selectedStrategy].risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
piiDetectionResult.available_strategies[selectedStrategy].risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
'bg-yellow-100 text-yellow-800'
}`}>
{piiDetectionResult.available_strategies[selectedStrategy].risk_level}
</span>
</div>
<div>
<strong>Reversible:</strong> {piiDetectionResult.available_strategies[selectedStrategy].reversible ? 'Yes' : 'No'}
</div>
</div>
<div className="mt-2 text-xs text-slate-600">
<strong>Use Cases:</strong> {piiDetectionResult.available_strategies[selectedStrategy].use_cases.join(', ')}
</div>
</div>
))}
)}
</div>
</div>
</details>
</div>
</div>
);
})}
</div>
</div>

{/* Apply All Button */}
<div className="sticky bottom-0 p-4 bg-gradient-to-t from-white via-white to-transparent">
{/* Strategy Summary */}
<div className="mb-4 p-4 bg-blue-50 rounded-lg border-2 border-blue-200">
<div className="text-sm font-semibold text-blue-900 mb-2">
📋 Selected Strategies Summary
</div>
<div className="grid grid-cols-2 md:grid-cols-3 gap-2 text-xs">
<div className="p-2 bg-white rounded border border-blue-200">
<div className="font-semibold text-slate-700">Total Features:</div>
<div className="text-2xl font-bold text-blue-900">
{piiDetectionResult.risky_features.length}
</div>
</div>
<div className="p-2 bg-white rounded border border-green-200">
<div className="font-semibold text-slate-700">Enabled:</div>
<div className="text-2xl font-bold text-green-900">
{Object.values(selectedStrategies).filter(s => s.enabled).length}
</div>
</div>
<div className="p-2 bg-white rounded border border-slate-200">
<div className="font-semibold text-slate-700">Disabled:</div>
<div className="text-2xl font-bold text-slate-900">
{Object.values(selectedStrategies).filter(s => !s.enabled).length}
</div>
</div>
</div>
<div className="mt-3 p-3 bg-white rounded-lg border border-blue-200">
<div className="text-xs font-semibold text-slate-600 mb-2">Enabled Strategies:</div>
<div className="flex flex-wrap gap-2">
{Object.entries(selectedStrategies)
.filter(([_, config]) => config.enabled)
.map(([column, config]) => (
<div key={column} className="px-2 py-1 bg-blue-100 text-blue-900 rounded text-xs font-medium border border-blue-300">
<strong>{column}:</strong> {config.strategy}
</div>
))}
{Object.values(selectedStrategies).filter(s => s.enabled).length === 0 && (
<div className="text-xs text-slate-500 italic">No strategies enabled</div>
)}
</div>
</div>
</div>

<button
className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg"
onClick={() => alert('Apply all recommended strategies and clean dataset')}
className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
disabled={isProcessing || !uploadedFile || Object.values(selectedStrategies).filter(s => s.enabled).length === 0}
onClick={handleClean}
>
✓ Apply All Recommended Strategies & Clean Dataset
{isProcessing ? '⏳ Processing...' :
Object.values(selectedStrategies).filter(s => s.enabled).length === 0 ?
'⚠️ Enable at least one strategy to clean' :
`✓ Apply ${Object.values(selectedStrategies).filter(s => s.enabled).length} Selected Strategies & Clean Dataset`}
</button>
<div className="mt-2 text-xs text-center text-slate-500">
Note: Only enabled features will be anonymized
</div>
</div>
</div>
) : (
@@ -139,10 +139,18 @@ export async function analyzeDataset(file: File): Promise<AnalyzeResponse> {
/**
 * Clean dataset - detect and anonymize PII
 */
export async function cleanDataset(file: File): Promise<CleanResponse> {
export async function cleanDataset(
  file: File,
  customStrategies?: Record<string, { enabled: boolean; strategy: string }>
): Promise<CleanResponse> {
  const formData = new FormData();
  formData.append('file', file);

  // Add custom strategies if provided
  if (customStrategies) {
    formData.append('custom_strategies', JSON.stringify(customStrategies));
  }

  const response = await fetch(`${API_BASE_URL}/api/clean`, {
    method: 'POST',
    body: formData,