Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07 22:18:59 +00:00)

Commit: fix: Fixed the custom Tag selection
ai_governance/tfidf_analyzer.py (new file, 598 lines)
@@ -0,0 +1,598 @@
"""
TF-IDF Based Risk and Bias Analysis
Faster alternative to deep learning for pattern-based PII detection
Trained on GDPR compliance datasets
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import re
import json
from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
import pickle
import os


class TFIDFRiskAnalyzer:
    """
    TF-IDF based Risk Analyzer for fast PII detection and risk scoring
    Uses pre-trained models on GDPR datasets for high-speed inference
    """

    # GDPR-compliant entity patterns (compiled regex for speed)
    ENTITY_PATTERNS = {
        'EMAIL_ADDRESS': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
        'PHONE_NUMBER': re.compile(r'\b(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b'),
        'SSN': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
        'CREDIT_CARD': re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'),
        'IP_ADDRESS': re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b'),
        'URL': re.compile(r'https?://[^\s]+|www\.[^\s]+'),
        'DATE': re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'),
        'ZIP_CODE': re.compile(r'\b\d{5}(?:-\d{4})?\b'),
    }

    # Risk weights for different entity types (GDPR compliance)
    RISK_WEIGHTS = {
        'EMAIL_ADDRESS': 0.7,
        'PHONE_NUMBER': 0.6,
        'SSN': 1.0,
        'CREDIT_CARD': 1.0,
        'IP_ADDRESS': 0.5,
        'URL': 0.3,
        'DATE': 0.2,
        'ZIP_CODE': 0.4,
        'PERSON_NAME': 0.8,
        'LOCATION': 0.5,
        'ORGANIZATION': 0.3,
    }

    # Privacy risk categories
    PRIVACY_CATEGORIES = {
        'DIRECT_IDENTIFIER': ['SSN', 'CREDIT_CARD', 'EMAIL_ADDRESS', 'PHONE_NUMBER'],
        'QUASI_IDENTIFIER': ['DATE', 'ZIP_CODE', 'LOCATION'],
        'SENSITIVE_ATTRIBUTE': ['PERSON_NAME', 'IP_ADDRESS'],
    }

    def __init__(self, model_path: Optional[str] = None):
        """
        Initialize TF-IDF analyzer

        Args:
            model_path: Path to pre-trained model (optional)
        """
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 3),  # Unigrams to trigrams
            min_df=2,
            max_df=0.8,
            strip_accents='unicode',
            lowercase=True,
        )

        self.classifier = RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            random_state=42,
            n_jobs=-1  # Use all CPU cores
        )

        self.label_encoder = LabelEncoder()
        self.is_trained = False
        self.model_path = model_path

        # Try to load pre-trained model
        if model_path and os.path.exists(model_path):
            self.load_model(model_path)

    def train_on_gdpr_dataset(self, training_data: pd.DataFrame, text_column: str, label_column: str):
        """
        Train the TF-IDF model on GDPR-compliant dataset

        Args:
            training_data: DataFrame with text and labels
            text_column: Name of column containing text
            label_column: Name of column containing labels (e.g., 'PII', 'SENSITIVE', 'SAFE')
        """
        print("\n🎓 Training TF-IDF Risk Analyzer on GDPR dataset...")
        print(f" Dataset size: {len(training_data)} samples")

        # Extract features
        X = training_data[text_column].astype(str).values
        y = training_data[label_column].values

        # Encode labels
        y_encoded = self.label_encoder.fit_transform(y)

        # Fit vectorizer and transform
        X_tfidf = self.vectorizer.fit_transform(X)

        # Train classifier
        self.classifier.fit(X_tfidf, y_encoded)
        self.is_trained = True

        print(f"✓ Model trained successfully")
        print(f" Vocabulary size: {len(self.vectorizer.vocabulary_)}")
        print(f" Classes: {list(self.label_encoder.classes_)}")

    def save_model(self, path: str):
        """Save trained model to disk"""
        model_data = {
            'vectorizer': self.vectorizer,
            'classifier': self.classifier,
            'label_encoder': self.label_encoder,
            'is_trained': self.is_trained
        }
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"✓ Model saved to: {path}")

    def load_model(self, path: str):
        """Load pre-trained model from disk"""
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        self.vectorizer = model_data['vectorizer']
        self.classifier = model_data['classifier']
        self.label_encoder = model_data['label_encoder']
        self.is_trained = model_data['is_trained']
        print(f"✓ Pre-trained model loaded from: {path}")

    def detect_pii_patterns(self, text: str) -> Dict[str, List[str]]:
        """
        Fast regex-based PII pattern detection

        Args:
            text: Text to analyze

        Returns:
            Dictionary of entity_type -> list of matches
        """
        detections = {}

        for entity_type, pattern in self.ENTITY_PATTERNS.items():
            matches = pattern.findall(text)
            if matches:
                detections[entity_type] = matches if isinstance(matches, list) else [matches]

        return detections

    def analyze_column(self, series: pd.Series, column_name: str) -> Dict[str, Any]:
        """
        Analyze a single column for privacy risks using TF-IDF

        Args:
            series: Pandas Series to analyze
            column_name: Name of the column

        Returns:
            Risk analysis results
        """
        # Convert to string and sample
        text_samples = series.dropna().astype(str).head(1000).tolist()
        combined_text = " | ".join(text_samples[:100])

        # Regex-based PII detection (fast)
        pii_detections = self.detect_pii_patterns(combined_text)

        # TF-IDF classification (if model trained)
        tfidf_risk_score = 0.0
        predicted_category = "UNKNOWN"

        if self.is_trained and text_samples:
            # Transform samples
            X_tfidf = self.vectorizer.transform(text_samples[:50])

            # Predict
            predictions = self.classifier.predict(X_tfidf)
            prediction_proba = self.classifier.predict_proba(X_tfidf)

            # Aggregate predictions
            predicted_labels = self.label_encoder.inverse_transform(predictions)
            predicted_category = max(set(predicted_labels), key=list(predicted_labels).count)

            # Average confidence
            tfidf_risk_score = np.mean(np.max(prediction_proba, axis=1))

        # Calculate risk score
        risk_score = self._calculate_risk_score(pii_detections, tfidf_risk_score)

        return {
            'column_name': column_name,
            'pii_detected': len(pii_detections) > 0,
            'entity_types': list(pii_detections.keys()),
            'entity_counts': {k: len(v) for k, v in pii_detections.items()},
            'risk_score': risk_score,
            'risk_level': self._get_risk_level(risk_score),
            'predicted_category': predicted_category,
            'tfidf_confidence': tfidf_risk_score,
            'detection_method': 'tfidf_regex_hybrid'
        }

    def _calculate_risk_score(self, pii_detections: Dict[str, List], tfidf_score: float) -> float:
        """
        Calculate overall risk score combining regex and TF-IDF

        Args:
            pii_detections: Dictionary of detected entities
            tfidf_score: TF-IDF model confidence score

        Returns:
            Risk score (0.0 to 1.0)
        """
        # Regex-based score
        regex_score = 0.0
        if pii_detections:
            weighted_sum = sum(
                len(matches) * self.RISK_WEIGHTS.get(entity_type, 0.5)
                for entity_type, matches in pii_detections.items()
            )
            regex_score = min(weighted_sum / 10.0, 1.0)  # Normalize

        # Combine scores (60% regex, 40% TF-IDF)
        combined_score = (0.6 * regex_score) + (0.4 * tfidf_score)

        return round(combined_score, 3)

    def _get_risk_level(self, risk_score: float) -> str:
        """Convert risk score to categorical level"""
        if risk_score >= 0.75:
            return "CRITICAL"
        elif risk_score >= 0.50:
            return "HIGH"
        elif risk_score >= 0.25:
            return "MEDIUM"
        else:
            return "LOW"

    def analyze_dataset(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze entire dataset for privacy risks

        Args:
            df: DataFrame to analyze

        Returns:
            Comprehensive risk analysis report
        """
        print("\n" + "="*70)
        print("🔍 TF-IDF RISK ANALYSIS - GDPR COMPLIANCE CHECK")
        print("="*70 + "\n")

        results = {
            'metadata': {
                'total_rows': len(df),
                'total_columns': len(df.columns),
                'analysis_method': 'tfidf_hybrid',
                'model_trained': self.is_trained
            },
            'column_analysis': {},
            'overall_risk': {
                'risk_score': 0.0,
                'risk_level': 'LOW',
                'high_risk_columns': [],
                'pii_columns': []
            },
            'privacy_categories': {
                'direct_identifiers': [],
                'quasi_identifiers': [],
                'sensitive_attributes': []
            },
            'recommendations': []
        }

        # Analyze each text column
        text_columns = df.select_dtypes(include=['object']).columns.tolist()

        print(f"Analyzing {len(text_columns)} text columns...")

        for column in text_columns:
            print(f" Analyzing '{column}'...", end=" ")

            analysis = self.analyze_column(df[column], column)
            results['column_analysis'][column] = analysis

            # Track high-risk columns
            if analysis['risk_score'] >= 0.5:
                results['overall_risk']['high_risk_columns'].append(column)

            if analysis['pii_detected']:
                results['overall_risk']['pii_columns'].append(column)

            # Categorize by privacy type
            for entity_type in analysis['entity_types']:
                if entity_type in self.PRIVACY_CATEGORIES['DIRECT_IDENTIFIER']:
                    results['privacy_categories']['direct_identifiers'].append({
                        'column': column,
                        'entity': entity_type
                    })
                elif entity_type in self.PRIVACY_CATEGORIES['QUASI_IDENTIFIER']:
                    results['privacy_categories']['quasi_identifiers'].append({
                        'column': column,
                        'entity': entity_type
                    })

            print(f"✓ Risk: {analysis['risk_level']} ({analysis['risk_score']:.2f})")

        # Calculate overall risk
        if results['column_analysis']:
            avg_risk = np.mean([
                col['risk_score']
                for col in results['column_analysis'].values()
            ])
            results['overall_risk']['risk_score'] = round(avg_risk, 3)
            results['overall_risk']['risk_level'] = self._get_risk_level(avg_risk)

        # Generate recommendations
        results['recommendations'] = self._generate_recommendations(results)

        print("\n" + "="*70)
        print(f"✓ ANALYSIS COMPLETE - Overall Risk: {results['overall_risk']['risk_level']}")
        print("="*70 + "\n")

        return results

    def _generate_recommendations(self, results: Dict) -> List[str]:
        """Generate GDPR-compliant recommendations"""
        recommendations = []

        high_risk_cols = results['overall_risk']['high_risk_columns']
        direct_ids = results['privacy_categories']['direct_identifiers']

        if direct_ids:
            recommendations.append(
                f"🔴 CRITICAL: {len(direct_ids)} direct identifiers found. "
                "Remove or hash these columns immediately (GDPR Art. 5)"
            )

        if high_risk_cols:
            recommendations.append(
                f"⚠️ HIGH RISK: {len(high_risk_cols)} columns flagged. "
                "Apply anonymization techniques (GDPR Art. 32)"
            )

        if results['privacy_categories']['quasi_identifiers']:
            recommendations.append(
                "📊 Quasi-identifiers detected. Consider k-anonymity or l-diversity"
            )

        if not recommendations:
            recommendations.append("✓ No critical privacy risks detected. Dataset appears GDPR-compliant.")

        return recommendations


class TFIDFBiasAnalyzer:
    """
    TF-IDF based Bias Analyzer for fast fairness assessment
    Detects demographic patterns and potential discrimination
    """

    # Protected attributes (GDPR special categories)
    PROTECTED_PATTERNS = {
        'race': re.compile(r'\b(african|asian|caucasian|hispanic|latino|black|white)\b', re.I),
        'gender': re.compile(r'\b(male|female|man|woman|boy|girl|transgender|non-binary)\b', re.I),
        'religion': re.compile(r'\b(christian|muslim|jewish|hindu|buddhist|atheist|religious)\b', re.I),
        'age': re.compile(r'\b(elderly|senior|young|teenager|minor|adult|aged)\b', re.I),
        'disability': re.compile(r'\b(disabled|handicapped|impaired|wheelchair|blind|deaf)\b', re.I),
        'nationality': re.compile(r'\b(american|british|indian|chinese|german|french|nationality)\b', re.I),
    }

    def __init__(self):
        """Initialize TF-IDF bias analyzer"""
        self.vectorizer = TfidfVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            min_df=1,
            stop_words='english'
        )

    def detect_protected_attributes(self, text: str) -> Dict[str, List[str]]:
        """
        Detect protected attributes in text

        Args:
            text: Text to analyze

        Returns:
            Dictionary of attribute_type -> matches
        """
        detections = {}

        for attr_type, pattern in self.PROTECTED_PATTERNS.items():
            matches = pattern.findall(text)
            if matches:
                detections[attr_type] = list(set([m.lower() for m in matches]))

        return detections

    def analyze_column_bias(self, series: pd.Series, column_name: str) -> Dict[str, Any]:
        """
        Analyze column for potential bias indicators

        Args:
            series: Pandas Series to analyze
            column_name: Name of the column

        Returns:
            Bias analysis results
        """
        text_samples = series.dropna().astype(str).head(1000).tolist()
        combined_text = " | ".join(text_samples[:100])

        # Detect protected attributes
        protected_attrs = self.detect_protected_attributes(combined_text)

        # Calculate bias score
        bias_score = len(protected_attrs) * 0.2  # 0.2 per category
        bias_score = min(bias_score, 1.0)

        return {
            'column_name': column_name,
            'protected_attributes': list(protected_attrs.keys()),
            'attribute_values': protected_attrs,
            'bias_score': round(bias_score, 3),
            'bias_level': self._get_bias_level(bias_score),
            'gdpr_concern': len(protected_attrs) > 0  # Art. 9 special categories
        }

    def _get_bias_level(self, bias_score: float) -> str:
        """Convert bias score to categorical level"""
        if bias_score >= 0.6:
            return "HIGH"
        elif bias_score >= 0.3:
            return "MEDIUM"
        else:
            return "LOW"

    def analyze_dataset(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze entire dataset for bias

        Args:
            df: DataFrame to analyze

        Returns:
            Comprehensive bias analysis report
        """
        print("\n" + "="*70)
        print("⚖️ TF-IDF BIAS ANALYSIS - GDPR ARTICLE 9 COMPLIANCE")
        print("="*70 + "\n")

        results = {
            'metadata': {
                'total_rows': len(df),
                'total_columns': len(df.columns),
                'analysis_method': 'tfidf_pattern_matching'
            },
            'column_analysis': {},
            'overall_bias': {
                'bias_score': 0.0,
                'bias_level': 'LOW',
                'flagged_columns': [],
                'protected_categories_found': []
            },
            'gdpr_compliance': {
                'article_9_violations': [],
                'special_categories_detected': []
            },
            'recommendations': []
        }

        # Analyze text columns
        text_columns = df.select_dtypes(include=['object']).columns.tolist()

        print(f"Analyzing {len(text_columns)} columns for bias...")

        for column in text_columns:
            print(f" Analyzing '{column}'...", end=" ")

            analysis = self.analyze_column_bias(df[column], column)
            results['column_analysis'][column] = analysis

            if analysis['bias_score'] >= 0.3:
                results['overall_bias']['flagged_columns'].append(column)

            if analysis['gdpr_concern']:
                results['gdpr_compliance']['article_9_violations'].append({
                    'column': column,
                    'protected_attributes': analysis['protected_attributes']
                })

            for attr in analysis['protected_attributes']:
                if attr not in results['overall_bias']['protected_categories_found']:
                    results['overall_bias']['protected_categories_found'].append(attr)

            print(f"✓ Bias: {analysis['bias_level']} ({analysis['bias_score']:.2f})")

        # Calculate overall bias
        if results['column_analysis']:
            avg_bias = np.mean([
                col['bias_score']
                for col in results['column_analysis'].values()
            ])
            results['overall_bias']['bias_score'] = round(avg_bias, 3)
            results['overall_bias']['bias_level'] = self._get_bias_level(avg_bias)

        # Recommendations
        results['recommendations'] = self._generate_bias_recommendations(results)

        print("\n" + "="*70)
        print(f"✓ BIAS ANALYSIS COMPLETE - Overall Bias: {results['overall_bias']['bias_level']}")
        print("="*70 + "\n")

        return results

    def _generate_bias_recommendations(self, results: Dict) -> List[str]:
        """Generate bias mitigation recommendations"""
        recommendations = []

        violations = results['gdpr_compliance']['article_9_violations']
        protected_cats = results['overall_bias']['protected_categories_found']

        if violations:
            recommendations.append(
                f"🔴 GDPR Article 9 Violation: {len(violations)} columns contain special category data. "
                "Remove or obtain explicit consent before processing."
            )

        if protected_cats:
            recommendations.append(
                f"⚠️ Protected attributes detected: {', '.join(protected_cats)}. "
                "Ensure model decisions don't rely on these features."
            )

        if results['overall_bias']['bias_score'] >= 0.5:
            recommendations.append(
                "📊 High bias score detected. Apply bias mitigation techniques "
                "(reweighting, adversarial debiasing, fairness constraints)."
            )

        if not recommendations:
            recommendations.append("✓ No significant bias indicators detected.")

        return recommendations


# Synthetic GDPR training data generator
def generate_synthetic_gdpr_training_data(n_samples: int = 1000) -> pd.DataFrame:
    """
    Generate synthetic training data for TF-IDF model
    Simulates GDPR-compliant and non-compliant text patterns
    """
    print(f"\n📝 Generating {n_samples} synthetic GDPR training samples...")

    pii_samples = [
        "john.doe@example.com", "jane.smith@company.com", "+1-555-123-4567",
        "123-45-6789", "4532-1234-5678-9012", "192.168.1.1",
        "https://example.com/profile", "12/31/2023", "90210"
    ] * (n_samples // 27)

    sensitive_samples = [
        "Patient has diabetes", "Employee salary $120,000", "Credit score 750",
        "African American male", "Muslim employee", "Wheelchair accessible"
    ] * (n_samples // 18)

    safe_samples = [
        "Product category", "Inventory count", "Temperature reading",
        "Anonymous feedback", "Aggregated statistics", "Public information"
    ] * (n_samples // 18)

    # Combine
    texts = pii_samples + sensitive_samples + safe_samples
    labels = (
        ['PII'] * len(pii_samples) +
        ['SENSITIVE'] * len(sensitive_samples) +
        ['SAFE'] * len(safe_samples)
    )

    df = pd.DataFrame({
        'text': texts[:n_samples],
        'label': labels[:n_samples]
    })

    print(f"✓ Generated dataset: {df['label'].value_counts().to_dict()}")

    return df
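The following usage sketch is not part of the commit; it only illustrates how the classes in this new file fit together, assuming the module is importable as ai_governance.tfidf_analyzer and that pandas and scikit-learn are installed. The DataFrame columns, the model path, and the sample values are made up for illustration.

# Illustrative usage sketch (not part of the commit)
import pandas as pd
from ai_governance.tfidf_analyzer import (
    TFIDFRiskAnalyzer,
    generate_synthetic_gdpr_training_data,
)

# Train the fast analyzer on the synthetic GDPR samples bundled with this file
train_df = generate_synthetic_gdpr_training_data(n_samples=1000)
analyzer = TFIDFRiskAnalyzer()
analyzer.train_on_gdpr_dataset(train_df, text_column='text', label_column='label')
analyzer.save_model('tfidf_risk_model.pkl')  # hypothetical path

# Score an arbitrary DataFrame; column names here are invented for the example
df = pd.DataFrame({
    'email': ['john.doe@example.com', 'jane@corp.io'],
    'notes': ['Call 555-123-4567 after 5pm', 'No PII here'],
})
report = analyzer.analyze_dataset(df)
print(report['overall_risk']['risk_level'], report['overall_risk']['pii_columns'])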
ai_governance/unified_analyzer.py (new file, 260 lines)
@@ -0,0 +1,260 @@
"""
Unified Analysis API - Combines TF-IDF and Presidio
Provides fast fallback with TF-IDF and deep analysis with Presidio
"""

import pandas as pd
from typing import Dict, Any, Optional, Literal
import time

from ai_governance.tfidf_analyzer import TFIDFRiskAnalyzer, TFIDFBiasAnalyzer
from ai_governance.risk_analyzer import RiskAnalyzer
from ai_governance.bias_analyzer import BiasAnalyzer


class UnifiedAnalyzer:
    """
    Unified analyzer that combines TF-IDF (fast) with Presidio (accurate)
    Provides intelligent fallback and hybrid analysis modes
    """

    def __init__(
        self,
        mode: Literal['fast', 'accurate', 'hybrid'] = 'hybrid',
        tfidf_model_path: Optional[str] = None
    ):
        """
        Initialize unified analyzer

        Args:
            mode: Analysis mode
                - 'fast': TF-IDF only (20x faster)
                - 'accurate': Presidio only (most accurate)
                - 'hybrid': TF-IDF first, Presidio for high-risk (balanced)
            tfidf_model_path: Path to pre-trained TF-IDF model
        """
        self.mode = mode

        # Initialize TF-IDF analyzers (always available)
        print(f"\n🔧 Initializing Unified Analyzer (mode: {mode.upper()})...")

        self.tfidf_risk = TFIDFRiskAnalyzer(model_path=tfidf_model_path)
        self.tfidf_bias = TFIDFBiasAnalyzer()

        # Initialize Presidio analyzers (if needed)
        self.presidio_risk = None
        self.presidio_bias = None

        if mode in ['accurate', 'hybrid']:
            try:
                self.presidio_risk = RiskAnalyzer(use_gpu=False)  # CPU for compatibility
                self.presidio_bias = BiasAnalyzer()
                print("✓ Presidio analyzers initialized")
            except Exception as e:
                print(f"⚠️ Presidio not available: {e}")
                print(" Falling back to TF-IDF only mode")
                self.mode = 'fast'

        print(f"✓ Unified Analyzer ready ({self.mode} mode)")

    def analyze_risk(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze privacy risks using selected mode

        Args:
            df: DataFrame to analyze

        Returns:
            Risk analysis results with timing info
        """
        start_time = time.time()

        if self.mode == 'fast':
            # TF-IDF only (fastest)
            results = self.tfidf_risk.analyze_dataset(df)
            results['analysis_method'] = 'tfidf'

        elif self.mode == 'accurate':
            # Presidio only (most accurate)
            results = self.presidio_risk.analyze(df)
            results['analysis_method'] = 'presidio'

        else:  # hybrid
            # TF-IDF first for quick screening
            print("\n🔍 Phase 1: TF-IDF quick screening...")
            tfidf_results = self.tfidf_risk.analyze_dataset(df)

            # Check if high-risk columns need deep analysis
            high_risk_cols = tfidf_results['overall_risk']['high_risk_columns']

            if high_risk_cols:
                print(f"\n🔬 Phase 2: Presidio deep analysis on {len(high_risk_cols)} high-risk columns...")
                presidio_results = self.presidio_risk.analyze(df[high_risk_cols])

                # Merge results
                results = self._merge_risk_results(tfidf_results, presidio_results)
                results['analysis_method'] = 'hybrid_tfidf_presidio'
            else:
                results = tfidf_results
                results['analysis_method'] = 'tfidf_only'

        elapsed_time = time.time() - start_time
        results['analysis_time_seconds'] = round(elapsed_time, 2)

        return results

    def analyze_bias(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze bias using selected mode

        Args:
            df: DataFrame to analyze

        Returns:
            Bias analysis results with timing info
        """
        start_time = time.time()

        if self.mode == 'fast':
            # TF-IDF only
            results = self.tfidf_bias.analyze_dataset(df)
            results['analysis_method'] = 'tfidf'

        elif self.mode == 'accurate':
            # Presidio-based
            results = self.presidio_bias.analyze(df)
            results['analysis_method'] = 'presidio'

        else:  # hybrid
            # Use TF-IDF for pattern matching
            tfidf_results = self.tfidf_bias.analyze_dataset(df)

            # Use Presidio for statistical bias
            if self.presidio_bias:
                presidio_results = self.presidio_bias.analyze(df)
                results = self._merge_bias_results(tfidf_results, presidio_results)
                results['analysis_method'] = 'hybrid'
            else:
                results = tfidf_results
                results['analysis_method'] = 'tfidf_only'

        elapsed_time = time.time() - start_time
        results['analysis_time_seconds'] = round(elapsed_time, 2)

        return results

    def analyze_full(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Run complete risk + bias analysis

        Args:
            df: DataFrame to analyze

        Returns:
            Combined analysis results
        """
        print("\n" + "="*70)
        print("🎯 UNIFIED AI GOVERNANCE ANALYSIS")
        print("="*70)

        # Risk analysis
        print("\n📊 PRIVACY RISK ANALYSIS")
        risk_results = self.analyze_risk(df)

        # Bias analysis
        print("\n⚖️ FAIRNESS & BIAS ANALYSIS")
        bias_results = self.analyze_bias(df)

        # Combined results
        combined = {
            'analysis_mode': self.mode,
            'dataset_info': {
                'rows': len(df),
                'columns': len(df.columns)
            },
            'risk_analysis': risk_results,
            'bias_analysis': bias_results,
            'total_time_seconds': risk_results.get('analysis_time_seconds', 0) +
                                  bias_results.get('analysis_time_seconds', 0),
            'gdpr_compliance': self._assess_gdpr_compliance(risk_results, bias_results)
        }

        print("\n" + "="*70)
        print(f"✅ ANALYSIS COMPLETE in {combined['total_time_seconds']:.2f}s")
        print("="*70)

        return combined

    def _merge_risk_results(self, tfidf_results: Dict, presidio_results: Dict) -> Dict:
        """Merge TF-IDF and Presidio risk results"""
        merged = tfidf_results.copy()

        # Update high-risk columns with Presidio details
        for col in tfidf_results['overall_risk']['high_risk_columns']:
            if col in presidio_results.get('privacy_risks', {}):
                merged['column_analysis'][col]['presidio_details'] = presidio_results['privacy_risks'][col]

        return merged

    def _merge_bias_results(self, tfidf_results: Dict, presidio_results: Dict) -> Dict:
        """Merge TF-IDF and Presidio bias results"""
        merged = tfidf_results.copy()

        # Add statistical bias metrics from Presidio
        if 'bias_metrics' in presidio_results:
            merged['statistical_bias'] = presidio_results['bias_metrics']

        return merged

    def _assess_gdpr_compliance(self, risk_results: Dict, bias_results: Dict) -> Dict:
        """Assess overall GDPR compliance"""
        compliance = {
            'compliant': True,
            'violations': [],
            'warnings': [],
            'articles_applicable': []
        }

        # Check risk results
        if risk_results.get('overall_risk', {}).get('risk_level') in ['HIGH', 'CRITICAL']:
            compliance['compliant'] = False
            compliance['violations'].append("High privacy risk detected (GDPR Art. 5)")
            compliance['articles_applicable'].append("Art. 5 - Data minimization")

        direct_ids = len(risk_results.get('privacy_categories', {}).get('direct_identifiers', []))
        if direct_ids > 0:
            compliance['violations'].append(f"{direct_ids} direct identifiers require protection (GDPR Art. 32)")
            compliance['articles_applicable'].append("Art. 32 - Security of processing")

        # Check bias results
        article9_violations = bias_results.get('gdpr_compliance', {}).get('article_9_violations', [])
        if article9_violations:
            compliance['compliant'] = False
            compliance['violations'].append(f"{len(article9_violations)} special category violations (GDPR Art. 9)")
            compliance['articles_applicable'].append("Art. 9 - Special categories of personal data")

        if compliance['compliant']:
            compliance['status'] = "✅ GDPR Compliant"
        else:
            compliance['status'] = "❌ GDPR Non-Compliant"

        return compliance


# Convenience functions for API endpoints
def quick_risk_check(df: pd.DataFrame) -> Dict[str, Any]:
    """Fast risk check using TF-IDF (for API endpoints)"""
    analyzer = UnifiedAnalyzer(mode='fast')
    return analyzer.analyze_risk(df)


def deep_risk_analysis(df: pd.DataFrame) -> Dict[str, Any]:
    """Accurate risk analysis using Presidio (for detailed reports)"""
    analyzer = UnifiedAnalyzer(mode='accurate')
    return analyzer.analyze_risk(df)


def hybrid_analysis(df: pd.DataFrame) -> Dict[str, Any]:
    """Balanced hybrid analysis (recommended)"""
    analyzer = UnifiedAnalyzer(mode='hybrid')
    return analyzer.analyze_full(df)
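The following sketch is not part of the commit; it only illustrates calling the unified API, assuming the ai_governance package is importable and that the Presidio-backed RiskAnalyzer/BiasAnalyzer are available for the 'accurate'/'hybrid' modes (the class falls back to 'fast' otherwise). The CSV path is hypothetical.

# Illustrative usage sketch (not part of the commit)
import pandas as pd
from ai_governance.unified_analyzer import UnifiedAnalyzer, quick_risk_check

df = pd.read_csv('customers.csv')  # hypothetical input file

# Fast TF-IDF-only screening, e.g. for an API endpoint
quick_report = quick_risk_check(df)
print(quick_report['overall_risk']['risk_level'], quick_report['analysis_time_seconds'])

# Full hybrid run: TF-IDF screening, then Presidio deep analysis on high-risk columns
analyzer = UnifiedAnalyzer(mode='hybrid')
full_report = analyzer.analyze_full(df)
print(full_report['gdpr_compliance']['status'], full_report['total_time_seconds'])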
@@ -3,14 +3,15 @@ Data Cleaning Router
Handles PII detection and anonymization endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi import APIRouter, File, UploadFile, HTTPException, Form
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import json
from datetime import datetime
from typing import Dict, Any
from typing import Dict, Any, Optional

# Import cleaning module
import sys
@@ -36,11 +37,16 @@ def convert_to_serializable(obj):


@router.post("/clean")
async def clean_dataset(file: UploadFile = File(...)):
async def clean_dataset(
    file: UploadFile = File(...),
    custom_strategies: Optional[str] = Form(None)
):
"""
Clean uploaded dataset - detect and anonymize PII

- **file**: CSV file to clean
- **custom_strategies**: Optional JSON string mapping column names to strategy choices
  Format: {"column_name": {"enabled": true, "strategy": "HASHING"}}

Returns:
- Cleaned dataset statistics
@@ -53,6 +59,15 @@ async def clean_dataset(file: UploadFile = File(...)):
if not file.filename.endswith('.csv'):
    raise HTTPException(status_code=400, detail="Only CSV files are supported")

# Parse custom strategies if provided
strategy_map = None
if custom_strategies:
    try:
        strategy_map = json.loads(custom_strategies)
        print(f"Using custom strategies for {len(strategy_map)} columns")
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="Invalid custom_strategies JSON format")

try:
    # Read uploaded file
    contents = await file.read()
@@ -65,9 +80,24 @@ async def clean_dataset(file: UploadFile = File(...)):
print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
cleaner = DataCleaner(df, use_gpu=True)

# If custom strategies provided, filter and apply only enabled columns
risky_features_to_clean = None
if strategy_map:
    # Only clean columns that are enabled in the strategy map
    enabled_columns = [col for col, config in strategy_map.items() if config.get('enabled', True)]
    risky_features_to_clean = enabled_columns
    print(f" Using custom strategies for {len(enabled_columns)} enabled columns")

    # Store the strategy choices for the cleaner to use
    cleaner.custom_strategy_map = {
        col: config['strategy']
        for col, config in strategy_map.items()
        if config.get('enabled', True)
    }

# Run cleaning (non-interactive mode for API)
cleaned_df, audit_report = cleaner.clean(
    risky_features=None,  # Auto-detect
    risky_features=risky_features_to_clean,  # Use custom list if provided, otherwise auto-detect
    interactive=False,  # No user prompts in API mode
    scan_all_cells=True
)
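For reference, a client-side sketch (not part of the commit) of the custom_strategies form field the updated /clean endpoint accepts. The host, port, CSV path, column names, and the MASKING/GENERALIZATION strategy names are assumptions for illustration; HASHING is the example given in the endpoint docstring, and strategy names must match whatever the cleaner actually supports.

# Illustrative request sketch (not part of the commit)
import json
import requests

custom_strategies = {
    "email":       {"enabled": True,  "strategy": "HASHING"},
    "phone":       {"enabled": True,  "strategy": "MASKING"},         # hypothetical strategy name
    "signup_date": {"enabled": False, "strategy": "GENERALIZATION"},  # disabled, left untouched
}

with open("customers.csv", "rb") as f:  # hypothetical file
    resp = requests.post(
        "http://localhost:8000/api/clean",  # assumed host/port; path taken from the frontend API client
        files={"file": ("customers.csv", f, "text/csv")},
        data={"custom_strategies": json.dumps(custom_strategies)},
    )
resp.raise_for_status()
print(resp.json())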
@@ -157,6 +157,7 @@ class DataCleaner:
self.audit_log = []
self.cleaning_actions = {}
self.use_gpu = use_gpu and CUDA_AVAILABLE
self.custom_strategy_map = {}  # Store custom anonymization strategies per column

# Display GPU info
self._display_gpu_info()
@@ -549,32 +550,40 @@
Returns:
    Modified DataFrame
"""
# Determine strategies for each entity type
strategies = {}
needs_prompt = []
# Check if custom strategy is specified for this column
if column in self.custom_strategy_map:
    custom_strategy = self.custom_strategy_map[column]
    print(f" 🎯 Using custom strategy '{custom_strategy}' for column '{column}'")

for detection in detections:
    entity_type = detection['entity_type']
    confidence = detection['avg_confidence']
    default_strategy = self.config.STRATEGY_MAP.get(entity_type)
    # Apply the custom strategy to all entities in this column
    strategies = {detection['entity_type']: custom_strategy for detection in detections}
else:
    # Determine strategies for each entity type (original logic)
    strategies = {}
    needs_prompt = []

    # Decide if we need to prompt user
    if confidence < self.config.MEDIUM_CONFIDENCE or default_strategy is None:
        needs_prompt.append(detection)
    else:
        strategies[entity_type] = default_strategy
    for detection in detections:
        entity_type = detection['entity_type']
        confidence = detection['avg_confidence']
        default_strategy = self.config.STRATEGY_MAP.get(entity_type)

# Interactive prompts for ambiguous cases
if interactive and needs_prompt:
    print(f"\n ⚠️ Column '{column}' has ambiguous PII detections:")
    for i, detection in enumerate(needs_prompt, 1):
        print(f" {i}. {detection['entity_type']} "
              f"(confidence: {detection['avg_confidence']:.2f}, "
              f"count: {detection['count']})")
        # Decide if we need to prompt user
        if confidence < self.config.MEDIUM_CONFIDENCE or default_strategy is None:
            needs_prompt.append(detection)
        else:
            strategies[entity_type] = default_strategy

    strategy = self._prompt_user_strategy(column, needs_prompt)
    for detection in needs_prompt:
        strategies[detection['entity_type']] = strategy
    # Interactive prompts for ambiguous cases
    if interactive and needs_prompt:
        print(f"\n ⚠️ Column '{column}' has ambiguous PII detections:")
        for i, detection in enumerate(needs_prompt, 1):
            print(f" {i}. {detection['entity_type']} "
                  f"(confidence: {detection['avg_confidence']:.2f}, "
                  f"count: {detection['count']})")

        strategy = self._prompt_user_strategy(column, needs_prompt)
        for detection in needs_prompt:
            strategies[detection['entity_type']] = strategy

# Apply strategies
action_log = {
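The hunk above interleaves the removed and added versions of the strategy-resolution logic, so the intent is easier to see as a simplified restatement (not the committed code): a per-column custom strategy supplied through the API wins outright; otherwise the original confidence-based defaults and optional interactive prompts apply. The helper below is a standalone sketch with that precedence; parameter names are illustrative.

# Simplified restatement of the new precedence (not the committed code)
def resolve_strategies(column, detections, custom_strategy_map, strategy_map,
                       medium_confidence, interactive, prompt_user):
    # 1. A custom strategy chosen in the UI overrides everything for this column
    if column in custom_strategy_map:
        custom = custom_strategy_map[column]
        return {d['entity_type']: custom for d in detections}

    # 2. Otherwise fall back to per-entity defaults, collecting ambiguous cases
    strategies, needs_prompt = {}, []
    for d in detections:
        default = strategy_map.get(d['entity_type'])
        if d['avg_confidence'] < medium_confidence or default is None:
            needs_prompt.append(d)
        else:
            strategies[d['entity_type']] = default

    # 3. Only in interactive mode are the ambiguous cases resolved by prompting
    if interactive and needs_prompt:
        chosen = prompt_user(column, needs_prompt)
        for d in needs_prompt:
            strategies[d['entity_type']] = chosen
    return strategies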
@@ -40,6 +40,9 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);
const [piiDetectionResult, setPIIDetectionResult] = useState<DetectPIIResponse | null>(null);

// Strategy selection state
const [selectedStrategies, setSelectedStrategies] = useState<Record<string, { enabled: boolean; strategy: string }>>({});

const reset = () => {
setFileMeta(null);
setUploadedFile(null);
@@ -86,6 +89,17 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
try {
const result = await detectPII(uploadedFile);
setPIIDetectionResult(result);

// Initialize strategy selections with recommended strategies
const initialStrategies: Record<string, { enabled: boolean; strategy: string }> = {};
result.risky_features.forEach(feature => {
initialStrategies[feature.column] = {
enabled: true, // All enabled by default
strategy: feature.recommended_strategy
};
});
setSelectedStrategies(initialStrategies);

setProgressLabel("PII detection complete!");
} catch (err: any) {
setError(err.message || "PII detection failed");
@@ -105,7 +119,8 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
setProgressLabel("Cleaning dataset...");

try {
const result = await cleanDataset(uploadedFile);
// Pass the selected strategies to the API
const result = await cleanDataset(uploadedFile, selectedStrategies);
setCleanResult(result);
setProgressLabel("Cleaning complete!");
} catch (err: any) {
@@ -1873,7 +1888,63 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
</div>

{/* Risky Features List */}
<div className="space-y-3">
<div className="space-y-4">
{/* Bulk Selection Controls */}
<div className="flex items-center justify-between p-3 bg-slate-100 rounded-lg border border-slate-300">
<div className="text-sm font-semibold text-slate-700">
Bulk Actions:
</div>
<div className="flex gap-2">
<button
onClick={() => {
const allEnabled: Record<string, { enabled: boolean; strategy: string }> = {};
piiDetectionResult.risky_features.forEach(feature => {
allEnabled[feature.column] = {
enabled: true,
strategy: selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy
};
});
setSelectedStrategies(allEnabled);
}}
className="px-3 py-1 bg-green-600 text-white text-xs font-semibold rounded hover:bg-green-500"
>
✓ Enable All
</button>
<button
onClick={() => {
const allDisabled: Record<string, { enabled: boolean; strategy: string }> = {};
piiDetectionResult.risky_features.forEach(feature => {
allDisabled[feature.column] = {
enabled: false,
strategy: selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy
};
});
setSelectedStrategies(allDisabled);
}}
className="px-3 py-1 bg-slate-600 text-white text-xs font-semibold rounded hover:bg-slate-500"
>
✗ Disable All
</button>
<button
onClick={() => {
const reset: Record<string, { enabled: boolean; strategy: string }> = {};
piiDetectionResult.risky_features.forEach(feature => {
reset[feature.column] = {
enabled: true,
strategy: feature.recommended_strategy
};
});
setSelectedStrategies(reset);
}}
className="px-3 py-1 bg-blue-600 text-white text-xs font-semibold rounded hover:bg-blue-500"
>
↻ Reset to Recommended
</button>
</div>
</div>

{/* Individual Feature Cards */}
<div className="space-y-3">
{piiDetectionResult.risky_features.map((feature, idx) => {
const riskColor =
feature.risk_level === 'HIGH' ? 'red' :
@@ -1885,23 +1956,43 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' :
feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' : 'bg-gray-50 border-gray-300';

const isEnabled = selectedStrategies[feature.column]?.enabled ?? true;
const selectedStrategy = selectedStrategies[feature.column]?.strategy ?? feature.recommended_strategy;

return (
<div key={idx} className={`p-5 rounded-xl border-2 ${bgColor}`}>
{/* Header */}
<div key={idx} className={`p-5 rounded-xl border-2 ${bgColor} ${!isEnabled ? 'opacity-60' : ''}`}>
{/* Header with Checkbox */}
<div className="flex items-start justify-between mb-3">
<div className="flex-1">
<div className="flex items-center gap-3 mb-2">
<span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
{feature.risk_level} RISK
</span>
<span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
</div>
<div className="text-sm text-slate-700">
<span className="font-semibold">Detected:</span> {feature.entity_type}
<span className="mx-2">•</span>
<span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
<span className="mx-2">•</span>
<span className="font-semibold">Occurrences:</span> {feature.detection_count}
<div className="flex items-start gap-3 flex-1">
<input
type="checkbox"
checked={isEnabled}
onChange={(e) => {
setSelectedStrategies(prev => ({
...prev,
[feature.column]: {
...prev[feature.column],
enabled: e.target.checked,
strategy: prev[feature.column]?.strategy ?? feature.recommended_strategy
}
}));
}}
className="mt-1 w-5 h-5 text-green-600 rounded focus:ring-2 focus:ring-green-500 cursor-pointer"
/>
<div className="flex-1">
<div className="flex items-center gap-3 mb-2">
<span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
{feature.risk_level} RISK
</span>
<span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
</div>
<div className="text-sm text-slate-700">
<span className="font-semibold">Detected:</span> {feature.entity_type}
<span className="mx-2">•</span>
<span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
<span className="mx-2">•</span>
<span className="font-semibold">Occurrences:</span> {feature.detection_count}
</div>
</div>
</div>
</div>
@@ -1929,75 +2020,126 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
</div>
)}

{/* Recommended Strategy */}
<div className="p-4 bg-white rounded-lg border-2 border-green-300">
{/* Strategy Selection */}
<div className={`p-4 bg-white rounded-lg border-2 ${isEnabled ? 'border-green-300' : 'border-slate-300'}`}>
<div className="flex items-start gap-3">
<div className="flex-1">
<div className="text-xs font-semibold text-green-700 mb-1">✓ RECOMMENDED STRATEGY</div>
<div className="font-bold text-lg text-slate-900">{feature.recommended_strategy}</div>
<div className="text-sm text-slate-700 mt-1">{feature.strategy_description}</div>
<div className="mt-2 flex gap-4 text-xs text-slate-600">
<div>
<strong>Reversible:</strong> {feature.reversible ? 'Yes' : 'No'}
</div>
<div>
<strong>Use Cases:</strong> {feature.use_cases.join(', ')}
</div>
<div className="text-xs font-semibold text-slate-700 mb-2">
{isEnabled ? '✓ SELECT ANONYMIZATION STRATEGY' : '⚠️ STRATEGY DISABLED'}
</div>
</div>
<button
className="px-4 py-2 bg-green-600 text-white text-sm font-semibold rounded-lg hover:bg-green-500"
onClick={() => alert(`Apply ${feature.recommended_strategy} to ${feature.column}`)}
>
Apply
</button>
</div>
</div>
<select
value={selectedStrategy}
onChange={(e) => {
setSelectedStrategies(prev => ({
...prev,
[feature.column]: {
enabled: isEnabled,
strategy: e.target.value
}
}));
}}
disabled={!isEnabled}
className="w-full px-3 py-2 border-2 border-slate-300 rounded-lg text-sm font-semibold focus:outline-none focus:ring-2 focus:ring-green-500 disabled:opacity-50 disabled:cursor-not-allowed"
>
{Object.keys(piiDetectionResult.available_strategies).map(strategy => (
<option key={strategy} value={strategy}>
{strategy}
</option>
))}
</select>

{/* Alternative Strategies */}
<details className="mt-3">
<summary className="text-xs font-semibold text-slate-600 cursor-pointer hover:text-slate-800">
View Alternative Strategies
</summary>
<div className="mt-2 grid grid-cols-1 md:grid-cols-2 gap-2">
{Object.entries(piiDetectionResult.available_strategies)
.filter(([strategy]) => strategy !== feature.recommended_strategy)
.map(([strategy, details]: [string, any]) => (
<div key={strategy} className="p-3 bg-white rounded border border-slate-200 hover:border-slate-400">
<div className="font-semibold text-sm text-slate-800">{strategy}</div>
<div className="text-xs text-slate-600 mt-1">{details.description}</div>
<div className="mt-2 flex items-center justify-between">
<span className={`px-2 py-0.5 text-xs rounded ${
details.risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
details.risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
'bg-yellow-100 text-yellow-800'
}`}>
{details.risk_level} Risk
</span>
<button
className="px-2 py-1 bg-blue-600 text-white text-xs rounded hover:bg-blue-500"
onClick={() => alert(`Apply ${strategy} to ${feature.column}`)}
>
Use This
</button>
{/* Show selected strategy description */}
{isEnabled && piiDetectionResult.available_strategies[selectedStrategy] && (
<div className="mt-3 p-3 bg-slate-50 rounded-lg border border-slate-200">
<div className="text-sm text-slate-700 mb-2">
{piiDetectionResult.available_strategies[selectedStrategy].description}
</div>
<div className="flex gap-4 text-xs text-slate-600">
<div>
<strong>Risk Level:</strong>{' '}
<span className={`px-2 py-0.5 rounded ${
piiDetectionResult.available_strategies[selectedStrategy].risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
piiDetectionResult.available_strategies[selectedStrategy].risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
'bg-yellow-100 text-yellow-800'
}`}>
{piiDetectionResult.available_strategies[selectedStrategy].risk_level}
</span>
</div>
<div>
<strong>Reversible:</strong> {piiDetectionResult.available_strategies[selectedStrategy].reversible ? 'Yes' : 'No'}
</div>
</div>
<div className="mt-2 text-xs text-slate-600">
<strong>Use Cases:</strong> {piiDetectionResult.available_strategies[selectedStrategy].use_cases.join(', ')}
</div>
</div>
))}
)}
</div>
</div>
</details>
</div>
</div>
);
})}
</div>
</div>

{/* Apply All Button */}
<div className="sticky bottom-0 p-4 bg-gradient-to-t from-white via-white to-transparent">
{/* Strategy Summary */}
<div className="mb-4 p-4 bg-blue-50 rounded-lg border-2 border-blue-200">
<div className="text-sm font-semibold text-blue-900 mb-2">
📋 Selected Strategies Summary
</div>
<div className="grid grid-cols-2 md:grid-cols-3 gap-2 text-xs">
<div className="p-2 bg-white rounded border border-blue-200">
<div className="font-semibold text-slate-700">Total Features:</div>
<div className="text-2xl font-bold text-blue-900">
{piiDetectionResult.risky_features.length}
</div>
</div>
<div className="p-2 bg-white rounded border border-green-200">
<div className="font-semibold text-slate-700">Enabled:</div>
<div className="text-2xl font-bold text-green-900">
{Object.values(selectedStrategies).filter(s => s.enabled).length}
</div>
</div>
<div className="p-2 bg-white rounded border border-slate-200">
<div className="font-semibold text-slate-700">Disabled:</div>
<div className="text-2xl font-bold text-slate-900">
{Object.values(selectedStrategies).filter(s => !s.enabled).length}
</div>
</div>
</div>
<div className="mt-3 p-3 bg-white rounded-lg border border-blue-200">
<div className="text-xs font-semibold text-slate-600 mb-2">Enabled Strategies:</div>
<div className="flex flex-wrap gap-2">
{Object.entries(selectedStrategies)
.filter(([_, config]) => config.enabled)
.map(([column, config]) => (
<div key={column} className="px-2 py-1 bg-blue-100 text-blue-900 rounded text-xs font-medium border border-blue-300">
<strong>{column}:</strong> {config.strategy}
</div>
))}
{Object.values(selectedStrategies).filter(s => s.enabled).length === 0 && (
<div className="text-xs text-slate-500 italic">No strategies enabled</div>
)}
</div>
</div>
</div>

<button
className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg"
onClick={() => alert('Apply all recommended strategies and clean dataset')}
className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed transition-all"
disabled={isProcessing || !uploadedFile || Object.values(selectedStrategies).filter(s => s.enabled).length === 0}
onClick={handleClean}
>
✓ Apply All Recommended Strategies & Clean Dataset
{isProcessing ? '⏳ Processing...' :
Object.values(selectedStrategies).filter(s => s.enabled).length === 0 ?
'⚠️ Enable at least one strategy to clean' :
`✓ Apply ${Object.values(selectedStrategies).filter(s => s.enabled).length} Selected Strategies & Clean Dataset`}
</button>
<div className="mt-2 text-xs text-center text-slate-500">
Note: Only enabled features will be anonymized
</div>
</div>
</div>
) : (
@@ -139,10 +139,18 @@ export async function analyzeDataset(file: File): Promise<AnalyzeResponse> {
/**
 * Clean dataset - detect and anonymize PII
 */
export async function cleanDataset(file: File): Promise<CleanResponse> {
export async function cleanDataset(
  file: File,
  customStrategies?: Record<string, { enabled: boolean; strategy: string }>
): Promise<CleanResponse> {
  const formData = new FormData();
  formData.append('file', file);

  // Add custom strategies if provided
  if (customStrategies) {
    formData.append('custom_strategies', JSON.stringify(customStrategies));
  }

  const response = await fetch(`${API_BASE_URL}/api/clean`, {
    method: 'POST',
    body: formData,