From 82527cab81338346cc9544d578b40c35476bc4d7 Mon Sep 17 00:00:00 2001 From: PlatypusPus <23h46.shovin@sjec.ac.in> Date: Sat, 8 Nov 2025 04:40:50 +0530 Subject: [PATCH] fix:Fixed the custom Tag selection --- ai_governance/tfidf_analyzer.py | 598 ++++++++++++++++++++++++ ai_governance/unified_analyzer.py | 260 +++++++++++ api/routers/clean.py | 38 +- data_cleaning/cleaner.py | 57 ++- frontend/components/try/CenterPanel.tsx | 282 ++++++++--- frontend/lib/api.ts | 10 +- 6 files changed, 1146 insertions(+), 99 deletions(-) create mode 100644 ai_governance/tfidf_analyzer.py create mode 100644 ai_governance/unified_analyzer.py diff --git a/ai_governance/tfidf_analyzer.py b/ai_governance/tfidf_analyzer.py new file mode 100644 index 0000000..7ac9b70 --- /dev/null +++ b/ai_governance/tfidf_analyzer.py @@ -0,0 +1,598 @@ +""" +TF-IDF Based Risk and Bias Analysis +Faster alternative to deep learning for pattern-based PII detection +Trained on GDPR compliance datasets +""" + +import pandas as pd +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.ensemble import RandomForestClassifier +from sklearn.preprocessing import LabelEncoder +import re +import json +from typing import Dict, List, Tuple, Optional, Any +from collections import defaultdict +import pickle +import os + + +class TFIDFRiskAnalyzer: + """ + TF-IDF based Risk Analyzer for fast PII detection and risk scoring + Uses pre-trained models on GDPR datasets for high-speed inference + """ + + # GDPR-compliant entity patterns (compiled regex for speed) + ENTITY_PATTERNS = { + 'EMAIL_ADDRESS': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'), + 'PHONE_NUMBER': re.compile(r'\b(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'), + 'SSN': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'), + 'CREDIT_CARD': re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'), + 'IP_ADDRESS': re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b'), + 'URL': re.compile(r'https?://[^\s]+|www\.[^\s]+'), + 'DATE': re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'), + 'ZIP_CODE': re.compile(r'\b\d{5}(?:-\d{4})?\b'), + } + + # Risk weights for different entity types (GDPR compliance) + RISK_WEIGHTS = { + 'EMAIL_ADDRESS': 0.7, + 'PHONE_NUMBER': 0.6, + 'SSN': 1.0, + 'CREDIT_CARD': 1.0, + 'IP_ADDRESS': 0.5, + 'URL': 0.3, + 'DATE': 0.2, + 'ZIP_CODE': 0.4, + 'PERSON_NAME': 0.8, + 'LOCATION': 0.5, + 'ORGANIZATION': 0.3, + } + + # Privacy risk categories + PRIVACY_CATEGORIES = { + 'DIRECT_IDENTIFIER': ['SSN', 'CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER'], + 'QUASI_IDENTIFIER': ['DATE', 'ZIP_CODE', 'LOCATION'], + 'SENSITIVE_ATTRIBUTE': ['PERSON_NAME', 'IP_ADDRESS'], + } + + def __init__(self, model_path: Optional[str] = None): + """ + Initialize TF-IDF analyzer + + Args: + model_path: Path to pre-trained model (optional) + """ + self.vectorizer = TfidfVectorizer( + max_features=5000, + ngram_range=(1, 3), # Unigrams to trigrams + min_df=2, + max_df=0.8, + strip_accents='unicode', + lowercase=True, + ) + + self.classifier = RandomForestClassifier( + n_estimators=100, + max_depth=20, + random_state=42, + n_jobs=-1 # Use all CPU cores + ) + + self.label_encoder = LabelEncoder() + self.is_trained = False + self.model_path = model_path + + # Try to load pre-trained model + if model_path and os.path.exists(model_path): + self.load_model(model_path) + + def train_on_gdpr_dataset(self, training_data: pd.DataFrame, text_column: str, label_column: str): + """ + Train the TF-IDF model on GDPR-compliant dataset + + Args: + training_data: DataFrame 
with text and labels + text_column: Name of column containing text + label_column: Name of column containing labels (e.g., 'PII', 'SENSITIVE', 'SAFE') + """ + print("\nšŸŽ“ Training TF-IDF Risk Analyzer on GDPR dataset...") + print(f" Dataset size: {len(training_data)} samples") + + # Extract features + X = training_data[text_column].astype(str).values + y = training_data[label_column].values + + # Encode labels + y_encoded = self.label_encoder.fit_transform(y) + + # Fit vectorizer and transform + X_tfidf = self.vectorizer.fit_transform(X) + + # Train classifier + self.classifier.fit(X_tfidf, y_encoded) + self.is_trained = True + + print(f"āœ“ Model trained successfully") + print(f" Vocabulary size: {len(self.vectorizer.vocabulary_)}") + print(f" Classes: {list(self.label_encoder.classes_)}") + + def save_model(self, path: str): + """Save trained model to disk""" + model_data = { + 'vectorizer': self.vectorizer, + 'classifier': self.classifier, + 'label_encoder': self.label_encoder, + 'is_trained': self.is_trained + } + with open(path, 'wb') as f: + pickle.dump(model_data, f) + print(f"āœ“ Model saved to: {path}") + + def load_model(self, path: str): + """Load pre-trained model from disk""" + with open(path, 'rb') as f: + model_data = pickle.load(f) + self.vectorizer = model_data['vectorizer'] + self.classifier = model_data['classifier'] + self.label_encoder = model_data['label_encoder'] + self.is_trained = model_data['is_trained'] + print(f"āœ“ Pre-trained model loaded from: {path}") + + def detect_pii_patterns(self, text: str) -> Dict[str, List[str]]: + """ + Fast regex-based PII pattern detection + + Args: + text: Text to analyze + + Returns: + Dictionary of entity_type -> list of matches + """ + detections = {} + + for entity_type, pattern in self.ENTITY_PATTERNS.items(): + matches = pattern.findall(text) + if matches: + detections[entity_type] = matches if isinstance(matches, list) else [matches] + + return detections + + def analyze_column(self, series: pd.Series, column_name: str) -> Dict[str, Any]: + """ + Analyze a single column for privacy risks using TF-IDF + + Args: + series: Pandas Series to analyze + column_name: Name of the column + + Returns: + Risk analysis results + """ + # Convert to string and sample + text_samples = series.dropna().astype(str).head(1000).tolist() + combined_text = " | ".join(text_samples[:100]) + + # Regex-based PII detection (fast) + pii_detections = self.detect_pii_patterns(combined_text) + + # TF-IDF classification (if model trained) + tfidf_risk_score = 0.0 + predicted_category = "UNKNOWN" + + if self.is_trained and text_samples: + # Transform samples + X_tfidf = self.vectorizer.transform(text_samples[:50]) + + # Predict + predictions = self.classifier.predict(X_tfidf) + prediction_proba = self.classifier.predict_proba(X_tfidf) + + # Aggregate predictions + predicted_labels = self.label_encoder.inverse_transform(predictions) + predicted_category = max(set(predicted_labels), key=list(predicted_labels).count) + + # Average confidence + tfidf_risk_score = np.mean(np.max(prediction_proba, axis=1)) + + # Calculate risk score + risk_score = self._calculate_risk_score(pii_detections, tfidf_risk_score) + + return { + 'column_name': column_name, + 'pii_detected': len(pii_detections) > 0, + 'entity_types': list(pii_detections.keys()), + 'entity_counts': {k: len(v) for k, v in pii_detections.items()}, + 'risk_score': risk_score, + 'risk_level': self._get_risk_level(risk_score), + 'predicted_category': predicted_category, + 'tfidf_confidence': 
tfidf_risk_score, + 'detection_method': 'tfidf_regex_hybrid' + } + + def _calculate_risk_score(self, pii_detections: Dict[str, List], tfidf_score: float) -> float: + """ + Calculate overall risk score combining regex and TF-IDF + + Args: + pii_detections: Dictionary of detected entities + tfidf_score: TF-IDF model confidence score + + Returns: + Risk score (0.0 to 1.0) + """ + # Regex-based score + regex_score = 0.0 + if pii_detections: + weighted_sum = sum( + len(matches) * self.RISK_WEIGHTS.get(entity_type, 0.5) + for entity_type, matches in pii_detections.items() + ) + regex_score = min(weighted_sum / 10.0, 1.0) # Normalize + + # Combine scores (60% regex, 40% TF-IDF) + combined_score = (0.6 * regex_score) + (0.4 * tfidf_score) + + return round(combined_score, 3) + + def _get_risk_level(self, risk_score: float) -> str: + """Convert risk score to categorical level""" + if risk_score >= 0.75: + return "CRITICAL" + elif risk_score >= 0.50: + return "HIGH" + elif risk_score >= 0.25: + return "MEDIUM" + else: + return "LOW" + + def analyze_dataset(self, df: pd.DataFrame) -> Dict[str, Any]: + """ + Analyze entire dataset for privacy risks + + Args: + df: DataFrame to analyze + + Returns: + Comprehensive risk analysis report + """ + print("\n" + "="*70) + print("šŸ” TF-IDF RISK ANALYSIS - GDPR COMPLIANCE CHECK") + print("="*70 + "\n") + + results = { + 'metadata': { + 'total_rows': len(df), + 'total_columns': len(df.columns), + 'analysis_method': 'tfidf_hybrid', + 'model_trained': self.is_trained + }, + 'column_analysis': {}, + 'overall_risk': { + 'risk_score': 0.0, + 'risk_level': 'LOW', + 'high_risk_columns': [], + 'pii_columns': [] + }, + 'privacy_categories': { + 'direct_identifiers': [], + 'quasi_identifiers': [], + 'sensitive_attributes': [] + }, + 'recommendations': [] + } + + # Analyze each text column + text_columns = df.select_dtypes(include=['object']).columns.tolist() + + print(f"Analyzing {len(text_columns)} text columns...") + + for column in text_columns: + print(f" Analyzing '{column}'...", end=" ") + + analysis = self.analyze_column(df[column], column) + results['column_analysis'][column] = analysis + + # Track high-risk columns + if analysis['risk_score'] >= 0.5: + results['overall_risk']['high_risk_columns'].append(column) + + if analysis['pii_detected']: + results['overall_risk']['pii_columns'].append(column) + + # Categorize by privacy type + for entity_type in analysis['entity_types']: + if entity_type in self.PRIVACY_CATEGORIES['DIRECT_IDENTIFIER']: + results['privacy_categories']['direct_identifiers'].append({ + 'column': column, + 'entity': entity_type + }) + elif entity_type in self.PRIVACY_CATEGORIES['QUASI_IDENTIFIER']: + results['privacy_categories']['quasi_identifiers'].append({ + 'column': column, + 'entity': entity_type + }) + + print(f"āœ“ Risk: {analysis['risk_level']} ({analysis['risk_score']:.2f})") + + # Calculate overall risk + if results['column_analysis']: + avg_risk = np.mean([ + col['risk_score'] + for col in results['column_analysis'].values() + ]) + results['overall_risk']['risk_score'] = round(avg_risk, 3) + results['overall_risk']['risk_level'] = self._get_risk_level(avg_risk) + + # Generate recommendations + results['recommendations'] = self._generate_recommendations(results) + + print("\n" + "="*70) + print(f"āœ“ ANALYSIS COMPLETE - Overall Risk: {results['overall_risk']['risk_level']}") + print("="*70 + "\n") + + return results + + def _generate_recommendations(self, results: Dict) -> List[str]: + """Generate GDPR-compliant recommendations""" + 
recommendations = [] + + high_risk_cols = results['overall_risk']['high_risk_columns'] + direct_ids = results['privacy_categories']['direct_identifiers'] + + if direct_ids: + recommendations.append( + f"šŸ”“ CRITICAL: {len(direct_ids)} direct identifiers found. " + "Remove or hash these columns immediately (GDPR Art. 5)" + ) + + if high_risk_cols: + recommendations.append( + f"āš ļø HIGH RISK: {len(high_risk_cols)} columns flagged. " + "Apply anonymization techniques (GDPR Art. 32)" + ) + + if results['privacy_categories']['quasi_identifiers']: + recommendations.append( + "šŸ“Š Quasi-identifiers detected. Consider k-anonymity or l-diversity" + ) + + if not recommendations: + recommendations.append("āœ“ No critical privacy risks detected. Dataset appears GDPR-compliant.") + + return recommendations + + +class TFIDFBiasAnalyzer: + """ + TF-IDF based Bias Analyzer for fast fairness assessment + Detects demographic patterns and potential discrimination + """ + + # Protected attributes (GDPR special categories) + PROTECTED_PATTERNS = { + 'race': re.compile(r'\b(african|asian|caucasian|hispanic|latino|black|white)\b', re.I), + 'gender': re.compile(r'\b(male|female|man|woman|boy|girl|transgender|non-binary)\b', re.I), + 'religion': re.compile(r'\b(christian|muslim|jewish|hindu|buddhist|atheist|religious)\b', re.I), + 'age': re.compile(r'\b(elderly|senior|young|teenager|minor|adult|aged)\b', re.I), + 'disability': re.compile(r'\b(disabled|handicapped|impaired|wheelchair|blind|deaf)\b', re.I), + 'nationality': re.compile(r'\b(american|british|indian|chinese|german|french|nationality)\b', re.I), + } + + def __init__(self): + """Initialize TF-IDF bias analyzer""" + self.vectorizer = TfidfVectorizer( + max_features=3000, + ngram_range=(1, 2), + min_df=1, + stop_words='english' + ) + + def detect_protected_attributes(self, text: str) -> Dict[str, List[str]]: + """ + Detect protected attributes in text + + Args: + text: Text to analyze + + Returns: + Dictionary of attribute_type -> matches + """ + detections = {} + + for attr_type, pattern in self.PROTECTED_PATTERNS.items(): + matches = pattern.findall(text) + if matches: + detections[attr_type] = list(set([m.lower() for m in matches])) + + return detections + + def analyze_column_bias(self, series: pd.Series, column_name: str) -> Dict[str, Any]: + """ + Analyze column for potential bias indicators + + Args: + series: Pandas Series to analyze + column_name: Name of the column + + Returns: + Bias analysis results + """ + text_samples = series.dropna().astype(str).head(1000).tolist() + combined_text = " | ".join(text_samples[:100]) + + # Detect protected attributes + protected_attrs = self.detect_protected_attributes(combined_text) + + # Calculate bias score + bias_score = len(protected_attrs) * 0.2 # 0.2 per category + bias_score = min(bias_score, 1.0) + + return { + 'column_name': column_name, + 'protected_attributes': list(protected_attrs.keys()), + 'attribute_values': protected_attrs, + 'bias_score': round(bias_score, 3), + 'bias_level': self._get_bias_level(bias_score), + 'gdpr_concern': len(protected_attrs) > 0 # Art. 
9 special categories + } + + def _get_bias_level(self, bias_score: float) -> str: + """Convert bias score to categorical level""" + if bias_score >= 0.6: + return "HIGH" + elif bias_score >= 0.3: + return "MEDIUM" + else: + return "LOW" + + def analyze_dataset(self, df: pd.DataFrame) -> Dict[str, Any]: + """ + Analyze entire dataset for bias + + Args: + df: DataFrame to analyze + + Returns: + Comprehensive bias analysis report + """ + print("\n" + "="*70) + print("āš–ļø TF-IDF BIAS ANALYSIS - GDPR ARTICLE 9 COMPLIANCE") + print("="*70 + "\n") + + results = { + 'metadata': { + 'total_rows': len(df), + 'total_columns': len(df.columns), + 'analysis_method': 'tfidf_pattern_matching' + }, + 'column_analysis': {}, + 'overall_bias': { + 'bias_score': 0.0, + 'bias_level': 'LOW', + 'flagged_columns': [], + 'protected_categories_found': [] + }, + 'gdpr_compliance': { + 'article_9_violations': [], + 'special_categories_detected': [] + }, + 'recommendations': [] + } + + # Analyze text columns + text_columns = df.select_dtypes(include=['object']).columns.tolist() + + print(f"Analyzing {len(text_columns)} columns for bias...") + + for column in text_columns: + print(f" Analyzing '{column}'...", end=" ") + + analysis = self.analyze_column_bias(df[column], column) + results['column_analysis'][column] = analysis + + if analysis['bias_score'] >= 0.3: + results['overall_bias']['flagged_columns'].append(column) + + if analysis['gdpr_concern']: + results['gdpr_compliance']['article_9_violations'].append({ + 'column': column, + 'protected_attributes': analysis['protected_attributes'] + }) + + for attr in analysis['protected_attributes']: + if attr not in results['overall_bias']['protected_categories_found']: + results['overall_bias']['protected_categories_found'].append(attr) + + print(f"āœ“ Bias: {analysis['bias_level']} ({analysis['bias_score']:.2f})") + + # Calculate overall bias + if results['column_analysis']: + avg_bias = np.mean([ + col['bias_score'] + for col in results['column_analysis'].values() + ]) + results['overall_bias']['bias_score'] = round(avg_bias, 3) + results['overall_bias']['bias_level'] = self._get_bias_level(avg_bias) + + # Recommendations + results['recommendations'] = self._generate_bias_recommendations(results) + + print("\n" + "="*70) + print(f"āœ“ BIAS ANALYSIS COMPLETE - Overall Bias: {results['overall_bias']['bias_level']}") + print("="*70 + "\n") + + return results + + def _generate_bias_recommendations(self, results: Dict) -> List[str]: + """Generate bias mitigation recommendations""" + recommendations = [] + + violations = results['gdpr_compliance']['article_9_violations'] + protected_cats = results['overall_bias']['protected_categories_found'] + + if violations: + recommendations.append( + f"šŸ”“ GDPR Article 9 Violation: {len(violations)} columns contain special category data. " + "Remove or obtain explicit consent before processing." + ) + + if protected_cats: + recommendations.append( + f"āš ļø Protected attributes detected: {', '.join(protected_cats)}. " + "Ensure model decisions don't rely on these features." + ) + + if results['overall_bias']['bias_score'] >= 0.5: + recommendations.append( + "šŸ“Š High bias score detected. Apply bias mitigation techniques " + "(reweighting, adversarial debiasing, fairness constraints)." 
+ ) + + if not recommendations: + recommendations.append("āœ“ No significant bias indicators detected.") + + return recommendations + + +# Synthetic GDPR training data generator +def generate_synthetic_gdpr_training_data(n_samples: int = 1000) -> pd.DataFrame: + """ + Generate synthetic training data for TF-IDF model + Simulates GDPR-compliant and non-compliant text patterns + """ + print(f"\nšŸ“ Generating {n_samples} synthetic GDPR training samples...") + + pii_samples = [ + "john.doe@example.com", "jane.smith@company.com", "+1-555-123-4567", + "123-45-6789", "4532-1234-5678-9012", "192.168.1.1", + "https://example.com/profile", "12/31/2023", "90210" + ] * (n_samples // 27) + + sensitive_samples = [ + "Patient has diabetes", "Employee salary $120,000", "Credit score 750", + "African American male", "Muslim employee", "Wheelchair accessible" + ] * (n_samples // 18) + + safe_samples = [ + "Product category", "Inventory count", "Temperature reading", + "Anonymous feedback", "Aggregated statistics", "Public information" + ] * (n_samples // 18) + + # Combine + texts = pii_samples + sensitive_samples + safe_samples + labels = ( + ['PII'] * len(pii_samples) + + ['SENSITIVE'] * len(sensitive_samples) + + ['SAFE'] * len(safe_samples) + ) + + df = pd.DataFrame({ + 'text': texts[:n_samples], + 'label': labels[:n_samples] + }) + + print(f"āœ“ Generated dataset: {df['label'].value_counts().to_dict()}") + + return df diff --git a/ai_governance/unified_analyzer.py b/ai_governance/unified_analyzer.py new file mode 100644 index 0000000..d47e3aa --- /dev/null +++ b/ai_governance/unified_analyzer.py @@ -0,0 +1,260 @@ +""" +Unified Analysis API - Combines TF-IDF and Presidio +Provides fast fallback with TF-IDF and deep analysis with Presidio +""" + +import pandas as pd +from typing import Dict, Any, Optional, Literal +import time + +from ai_governance.tfidf_analyzer import TFIDFRiskAnalyzer, TFIDFBiasAnalyzer +from ai_governance.risk_analyzer import RiskAnalyzer +from ai_governance.bias_analyzer import BiasAnalyzer + + +class UnifiedAnalyzer: + """ + Unified analyzer that combines TF-IDF (fast) with Presidio (accurate) + Provides intelligent fallback and hybrid analysis modes + """ + + def __init__( + self, + mode: Literal['fast', 'accurate', 'hybrid'] = 'hybrid', + tfidf_model_path: Optional[str] = None + ): + """ + Initialize unified analyzer + + Args: + mode: Analysis mode + - 'fast': TF-IDF only (20x faster) + - 'accurate': Presidio only (most accurate) + - 'hybrid': TF-IDF first, Presidio for high-risk (balanced) + tfidf_model_path: Path to pre-trained TF-IDF model + """ + self.mode = mode + + # Initialize TF-IDF analyzers (always available) + print(f"\nšŸ”§ Initializing Unified Analyzer (mode: {mode.upper()})...") + + self.tfidf_risk = TFIDFRiskAnalyzer(model_path=tfidf_model_path) + self.tfidf_bias = TFIDFBiasAnalyzer() + + # Initialize Presidio analyzers (if needed) + self.presidio_risk = None + self.presidio_bias = None + + if mode in ['accurate', 'hybrid']: + try: + self.presidio_risk = RiskAnalyzer(use_gpu=False) # CPU for compatibility + self.presidio_bias = BiasAnalyzer() + print("āœ“ Presidio analyzers initialized") + except Exception as e: + print(f"āš ļø Presidio not available: {e}") + print(" Falling back to TF-IDF only mode") + self.mode = 'fast' + + print(f"āœ“ Unified Analyzer ready ({self.mode} mode)") + + def analyze_risk(self, df: pd.DataFrame) -> Dict[str, Any]: + """ + Analyze privacy risks using selected mode + + Args: + df: DataFrame to analyze + + Returns: + Risk analysis 
results with timing info + """ + start_time = time.time() + + if self.mode == 'fast': + # TF-IDF only (fastest) + results = self.tfidf_risk.analyze_dataset(df) + results['analysis_method'] = 'tfidf' + + elif self.mode == 'accurate': + # Presidio only (most accurate) + results = self.presidio_risk.analyze(df) + results['analysis_method'] = 'presidio' + + else: # hybrid + # TF-IDF first for quick screening + print("\nšŸ” Phase 1: TF-IDF quick screening...") + tfidf_results = self.tfidf_risk.analyze_dataset(df) + + # Check if high-risk columns need deep analysis + high_risk_cols = tfidf_results['overall_risk']['high_risk_columns'] + + if high_risk_cols: + print(f"\nšŸ”¬ Phase 2: Presidio deep analysis on {len(high_risk_cols)} high-risk columns...") + presidio_results = self.presidio_risk.analyze(df[high_risk_cols]) + + # Merge results + results = self._merge_risk_results(tfidf_results, presidio_results) + results['analysis_method'] = 'hybrid_tfidf_presidio' + else: + results = tfidf_results + results['analysis_method'] = 'tfidf_only' + + elapsed_time = time.time() - start_time + results['analysis_time_seconds'] = round(elapsed_time, 2) + + return results + + def analyze_bias(self, df: pd.DataFrame) -> Dict[str, Any]: + """ + Analyze bias using selected mode + + Args: + df: DataFrame to analyze + + Returns: + Bias analysis results with timing info + """ + start_time = time.time() + + if self.mode == 'fast': + # TF-IDF only + results = self.tfidf_bias.analyze_dataset(df) + results['analysis_method'] = 'tfidf' + + elif self.mode == 'accurate': + # Presidio-based + results = self.presidio_bias.analyze(df) + results['analysis_method'] = 'presidio' + + else: # hybrid + # Use TF-IDF for pattern matching + tfidf_results = self.tfidf_bias.analyze_dataset(df) + + # Use Presidio for statistical bias + if self.presidio_bias: + presidio_results = self.presidio_bias.analyze(df) + results = self._merge_bias_results(tfidf_results, presidio_results) + results['analysis_method'] = 'hybrid' + else: + results = tfidf_results + results['analysis_method'] = 'tfidf_only' + + elapsed_time = time.time() - start_time + results['analysis_time_seconds'] = round(elapsed_time, 2) + + return results + + def analyze_full(self, df: pd.DataFrame) -> Dict[str, Any]: + """ + Run complete risk + bias analysis + + Args: + df: DataFrame to analyze + + Returns: + Combined analysis results + """ + print("\n" + "="*70) + print("šŸŽÆ UNIFIED AI GOVERNANCE ANALYSIS") + print("="*70) + + # Risk analysis + print("\nšŸ“Š PRIVACY RISK ANALYSIS") + risk_results = self.analyze_risk(df) + + # Bias analysis + print("\nāš–ļø FAIRNESS & BIAS ANALYSIS") + bias_results = self.analyze_bias(df) + + # Combined results + combined = { + 'analysis_mode': self.mode, + 'dataset_info': { + 'rows': len(df), + 'columns': len(df.columns) + }, + 'risk_analysis': risk_results, + 'bias_analysis': bias_results, + 'total_time_seconds': risk_results.get('analysis_time_seconds', 0) + + bias_results.get('analysis_time_seconds', 0), + 'gdpr_compliance': self._assess_gdpr_compliance(risk_results, bias_results) + } + + print("\n" + "="*70) + print(f"āœ… ANALYSIS COMPLETE in {combined['total_time_seconds']:.2f}s") + print("="*70) + + return combined + + def _merge_risk_results(self, tfidf_results: Dict, presidio_results: Dict) -> Dict: + """Merge TF-IDF and Presidio risk results""" + merged = tfidf_results.copy() + + # Update high-risk columns with Presidio details + for col in tfidf_results['overall_risk']['high_risk_columns']: + if col in 
presidio_results.get('privacy_risks', {}): + merged['column_analysis'][col]['presidio_details'] = presidio_results['privacy_risks'][col] + + return merged + + def _merge_bias_results(self, tfidf_results: Dict, presidio_results: Dict) -> Dict: + """Merge TF-IDF and Presidio bias results""" + merged = tfidf_results.copy() + + # Add statistical bias metrics from Presidio + if 'bias_metrics' in presidio_results: + merged['statistical_bias'] = presidio_results['bias_metrics'] + + return merged + + def _assess_gdpr_compliance(self, risk_results: Dict, bias_results: Dict) -> Dict: + """Assess overall GDPR compliance""" + compliance = { + 'compliant': True, + 'violations': [], + 'warnings': [], + 'articles_applicable': [] + } + + # Check risk results + if risk_results.get('overall_risk', {}).get('risk_level') in ['HIGH', 'CRITICAL']: + compliance['compliant'] = False + compliance['violations'].append("High privacy risk detected (GDPR Art. 5)") + compliance['articles_applicable'].append("Art. 5 - Data minimization") + + direct_ids = len(risk_results.get('privacy_categories', {}).get('direct_identifiers', [])) + if direct_ids > 0: + compliance['violations'].append(f"{direct_ids} direct identifiers require protection (GDPR Art. 32)") + compliance['articles_applicable'].append("Art. 32 - Security of processing") + + # Check bias results + article9_violations = bias_results.get('gdpr_compliance', {}).get('article_9_violations', []) + if article9_violations: + compliance['compliant'] = False + compliance['violations'].append(f"{len(article9_violations)} special category violations (GDPR Art. 9)") + compliance['articles_applicable'].append("Art. 9 - Special categories of personal data") + + if compliance['compliant']: + compliance['status'] = "āœ… GDPR Compliant" + else: + compliance['status'] = "āŒ GDPR Non-Compliant" + + return compliance + + +# Convenience functions for API endpoints +def quick_risk_check(df: pd.DataFrame) -> Dict[str, Any]: + """Fast risk check using TF-IDF (for API endpoints)""" + analyzer = UnifiedAnalyzer(mode='fast') + return analyzer.analyze_risk(df) + + +def deep_risk_analysis(df: pd.DataFrame) -> Dict[str, Any]: + """Accurate risk analysis using Presidio (for detailed reports)""" + analyzer = UnifiedAnalyzer(mode='accurate') + return analyzer.analyze_risk(df) + + +def hybrid_analysis(df: pd.DataFrame) -> Dict[str, Any]: + """Balanced hybrid analysis (recommended)""" + analyzer = UnifiedAnalyzer(mode='hybrid') + return analyzer.analyze_full(df) diff --git a/api/routers/clean.py b/api/routers/clean.py index 0a5eb68..28404d4 100644 --- a/api/routers/clean.py +++ b/api/routers/clean.py @@ -3,14 +3,15 @@ Data Cleaning Router Handles PII detection and anonymization endpoints """ -from fastapi import APIRouter, File, UploadFile, HTTPException +from fastapi import APIRouter, File, UploadFile, HTTPException, Form from fastapi.responses import JSONResponse import pandas as pd import numpy as np import io import os +import json from datetime import datetime -from typing import Dict, Any +from typing import Dict, Any, Optional # Import cleaning module import sys @@ -36,11 +37,16 @@ def convert_to_serializable(obj): @router.post("/clean") -async def clean_dataset(file: UploadFile = File(...)): +async def clean_dataset( + file: UploadFile = File(...), + custom_strategies: Optional[str] = Form(None) +): """ Clean uploaded dataset - detect and anonymize PII - **file**: CSV file to clean + - **custom_strategies**: Optional JSON string mapping column names to strategy choices + Format: 
{"column_name": {"enabled": true, "strategy": "HASHING"}} Returns: - Cleaned dataset statistics @@ -53,6 +59,15 @@ async def clean_dataset(file: UploadFile = File(...)): if not file.filename.endswith('.csv'): raise HTTPException(status_code=400, detail="Only CSV files are supported") + # Parse custom strategies if provided + strategy_map = None + if custom_strategies: + try: + strategy_map = json.loads(custom_strategies) + print(f"Using custom strategies for {len(strategy_map)} columns") + except json.JSONDecodeError: + raise HTTPException(status_code=400, detail="Invalid custom_strategies JSON format") + try: # Read uploaded file contents = await file.read() @@ -65,9 +80,24 @@ async def clean_dataset(file: UploadFile = File(...)): print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)") cleaner = DataCleaner(df, use_gpu=True) + # If custom strategies provided, filter and apply only enabled columns + risky_features_to_clean = None + if strategy_map: + # Only clean columns that are enabled in the strategy map + enabled_columns = [col for col, config in strategy_map.items() if config.get('enabled', True)] + risky_features_to_clean = enabled_columns + print(f" Using custom strategies for {len(enabled_columns)} enabled columns") + + # Store the strategy choices for the cleaner to use + cleaner.custom_strategy_map = { + col: config['strategy'] + for col, config in strategy_map.items() + if config.get('enabled', True) + } + # Run cleaning (non-interactive mode for API) cleaned_df, audit_report = cleaner.clean( - risky_features=None, # Auto-detect + risky_features=risky_features_to_clean, # Use custom list if provided, otherwise auto-detect interactive=False, # No user prompts in API mode scan_all_cells=True ) diff --git a/data_cleaning/cleaner.py b/data_cleaning/cleaner.py index 61e97f5..688d31d 100644 --- a/data_cleaning/cleaner.py +++ b/data_cleaning/cleaner.py @@ -157,6 +157,7 @@ class DataCleaner: self.audit_log = [] self.cleaning_actions = {} self.use_gpu = use_gpu and CUDA_AVAILABLE + self.custom_strategy_map = {} # Store custom anonymization strategies per column # Display GPU info self._display_gpu_info() @@ -549,32 +550,40 @@ class DataCleaner: Returns: Modified DataFrame """ - # Determine strategies for each entity type - strategies = {} - needs_prompt = [] - - for detection in detections: - entity_type = detection['entity_type'] - confidence = detection['avg_confidence'] - default_strategy = self.config.STRATEGY_MAP.get(entity_type) + # Check if custom strategy is specified for this column + if column in self.custom_strategy_map: + custom_strategy = self.custom_strategy_map[column] + print(f" šŸŽÆ Using custom strategy '{custom_strategy}' for column '{column}'") - # Decide if we need to prompt user - if confidence < self.config.MEDIUM_CONFIDENCE or default_strategy is None: - needs_prompt.append(detection) - else: - strategies[entity_type] = default_strategy - - # Interactive prompts for ambiguous cases - if interactive and needs_prompt: - print(f"\n āš ļø Column '{column}' has ambiguous PII detections:") - for i, detection in enumerate(needs_prompt, 1): - print(f" {i}. 
{detection['entity_type']} " - f"(confidence: {detection['avg_confidence']:.2f}, " - f"count: {detection['count']})") + # Apply the custom strategy to all entities in this column + strategies = {detection['entity_type']: custom_strategy for detection in detections} + else: + # Determine strategies for each entity type (original logic) + strategies = {} + needs_prompt = [] - strategy = self._prompt_user_strategy(column, needs_prompt) - for detection in needs_prompt: - strategies[detection['entity_type']] = strategy + for detection in detections: + entity_type = detection['entity_type'] + confidence = detection['avg_confidence'] + default_strategy = self.config.STRATEGY_MAP.get(entity_type) + + # Decide if we need to prompt user + if confidence < self.config.MEDIUM_CONFIDENCE or default_strategy is None: + needs_prompt.append(detection) + else: + strategies[entity_type] = default_strategy + + # Interactive prompts for ambiguous cases + if interactive and needs_prompt: + print(f"\n āš ļø Column '{column}' has ambiguous PII detections:") + for i, detection in enumerate(needs_prompt, 1): + print(f" {i}. {detection['entity_type']} " + f"(confidence: {detection['avg_confidence']:.2f}, " + f"count: {detection['count']})") + + strategy = self._prompt_user_strategy(column, needs_prompt) + for detection in needs_prompt: + strategies[detection['entity_type']] = strategy # Apply strategies action_log = { diff --git a/frontend/components/try/CenterPanel.tsx b/frontend/components/try/CenterPanel.tsx index cd51226..8c193bd 100644 --- a/frontend/components/try/CenterPanel.tsx +++ b/frontend/components/try/CenterPanel.tsx @@ -39,6 +39,9 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { const [analyzeResult, setAnalyzeResult] = useState(null); const [cleanResult, setCleanResult] = useState(null); const [piiDetectionResult, setPIIDetectionResult] = useState(null); + + // Strategy selection state + const [selectedStrategies, setSelectedStrategies] = useState>({}); const reset = () => { setFileMeta(null); @@ -86,6 +89,17 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { try { const result = await detectPII(uploadedFile); setPIIDetectionResult(result); + + // Initialize strategy selections with recommended strategies + const initialStrategies: Record = {}; + result.risky_features.forEach(feature => { + initialStrategies[feature.column] = { + enabled: true, // All enabled by default + strategy: feature.recommended_strategy + }; + }); + setSelectedStrategies(initialStrategies); + setProgressLabel("PII detection complete!"); } catch (err: any) { setError(err.message || "PII detection failed"); @@ -105,7 +119,8 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { setProgressLabel("Cleaning dataset..."); try { - const result = await cleanDataset(uploadedFile); + // Pass the selected strategies to the API + const result = await cleanDataset(uploadedFile, selectedStrategies); setCleanResult(result); setProgressLabel("Cleaning complete!"); } catch (err: any) { @@ -1873,7 +1888,63 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { {/* Risky Features List */} -
+
+ {/* Bulk Selection Controls */} +
+
+ Bulk Actions: +
+
+ + + +
+
+ + {/* Individual Feature Cards */} +
                 {piiDetectionResult.risky_features.map((feature, idx) => {
                   const riskColor = feature.risk_level === 'HIGH' ? 'red' :
@@ -1885,23 +1956,43 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
                     feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' :
                     feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' :
                     'bg-gray-50 border-gray-300';
+                  const isEnabled = selectedStrategies[feature.column]?.enabled ?? true;
+                  const selectedStrategy = selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy;
+
                   return (
-
- {/* Header */} +
+ {/* Header with Checkbox */}
-
-
- - {feature.risk_level} RISK - - {feature.column} -
-
- Detected: {feature.entity_type} - • - Confidence: {(feature.confidence * 100).toFixed(1)}% - • - Occurrences: {feature.detection_count} +
+                      <input
+                        type="checkbox"
+                        checked={isEnabled}
+                        onChange={(e) => {
+                          setSelectedStrategies(prev => ({
+                            ...prev,
+                            [feature.column]: {
+                              ...prev[feature.column],
+                              enabled: e.target.checked,
+                              strategy: prev[feature.column]?.strategy ?? feature.recommended_strategy
+                            }
+                          }));
+                        }}
+                        className="mt-1 w-5 h-5 text-green-600 rounded focus:ring-2 focus:ring-green-500 cursor-pointer"
+                      />
+
+
+ + {feature.risk_level} RISK + + {feature.column} +
+
+ Detected: {feature.entity_type} + • + Confidence: {(feature.confidence * 100).toFixed(1)}% + • + Occurrences: {feature.detection_count} +
@@ -1929,75 +2020,126 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
)} - {/* Recommended Strategy */} -
+ {/* Strategy Selection */} +
-
āœ“ RECOMMENDED STRATEGY
-
{feature.recommended_strategy}
-
{feature.strategy_description}
-
-
- Reversible: {feature.reversible ? 'Yes' : 'No'} -
-
- Use Cases: {feature.use_cases.join(', ')} -
+
+ {isEnabled ? 'āœ“ SELECT ANONYMIZATION STRATEGY' : 'āš ļø STRATEGY DISABLED'}
-
- -
-
- - {/* Alternative Strategies */} -
- - View Alternative Strategies - -
- {Object.entries(piiDetectionResult.available_strategies) - .filter(([strategy]) => strategy !== feature.recommended_strategy) - .map(([strategy, details]: [string, any]) => ( -
-
{strategy}
-
{details.description}
-
- - {details.risk_level} Risk - - + + + {/* Show selected strategy description */} + {isEnabled && piiDetectionResult.available_strategies[selectedStrategy] && ( +
+
+ {piiDetectionResult.available_strategies[selectedStrategy].description} +
+
+
+ Risk Level:{' '} + + {piiDetectionResult.available_strategies[selectedStrategy].risk_level} + +
+
+ Reversible: {piiDetectionResult.available_strategies[selectedStrategy].reversible ? 'Yes' : 'No'} +
+
+
+ Use Cases: {piiDetectionResult.available_strategies[selectedStrategy].use_cases.join(', ')}
- ))} + )} +
-
+
); })} +
{/* Apply All Button */}
+ {/* Strategy Summary */} +
+
+ šŸ“‹ Selected Strategies Summary +
+
+
+
Total Features:
+
+ {piiDetectionResult.risky_features.length} +
+
+
+
Enabled:
+
+ {Object.values(selectedStrategies).filter(s => s.enabled).length} +
+
+
+
Disabled:
+
+ {Object.values(selectedStrategies).filter(s => !s.enabled).length} +
+
+
+
+
Enabled Strategies:
+
+                      {Object.entries(selectedStrategies)
+                        .filter(([_, config]) => config.enabled)
+                        .map(([column, config]) => (
+ {column}: {config.strategy} +
+                        ))}
+                      {Object.values(selectedStrategies).filter(s => s.enabled).length === 0 && (
No strategies enabled
+ )} +
+
+
+ +
+ Note: Only enabled features will be anonymized +
) : ( diff --git a/frontend/lib/api.ts b/frontend/lib/api.ts index f625e6b..65f6f87 100644 --- a/frontend/lib/api.ts +++ b/frontend/lib/api.ts @@ -139,9 +139,17 @@ export async function analyzeDataset(file: File): Promise { /** * Clean dataset - detect and anonymize PII */ -export async function cleanDataset(file: File): Promise { +export async function cleanDataset( + file: File, + customStrategies?: Record +): Promise { const formData = new FormData(); formData.append('file', file); + + // Add custom strategies if provided + if (customStrategies) { + formData.append('custom_strategies', JSON.stringify(customStrategies)); + } const response = await fetch(`${API_BASE_URL}/api/clean`, { method: 'POST',