Merge pull request #6 from dionjoshualobo/main

Blah Blah
2026-02-07 22:18:59 +00:00 · 2025-11-07 21:06:07 +05:30
parent 660f716813 71206d5499
commit 019f40c378
9 changed files with 629 additions and 36 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -74,4 +74,4 @@ frontend/nordic-privacy-ai/.next/
 frontend/nordic-privacy-ai/out/
 frontend/nordic-privacy-ai/node_modules/

-Data
+Datamain.py
--- a/GDPRArticles.pdf
+++ b/GDPRArticles.pdf
--- a/api/main.py
+++ b/api/main.py
@@ -8,7 +8,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 import os

-from api.routers import analyze, clean, discovery
+from api.routers import analyze, clean, discovery, detect_pii

 # Create FastAPI app
 app = FastAPI(
@@ -37,6 +37,7 @@ app.mount("/reports", StaticFiles(directory=reports_dir), name="reports")
 # Include routers
 app.include_router(analyze.router, prefix="/api", tags=["AI Governance"])
 app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"])
+app.include_router(detect_pii.router, prefix="/api", tags=["PII Detection"])
 app.include_router(discovery.router, prefix="/api", tags=["Discover sources"])

@app.get("/")
--- a/api/routers/detect_pii.py
+++ b/api/routers/detect_pii.py
@@ -0,0 +1,224 @@
+"""
+PII Detection Router
+Detects risky features WITHOUT anonymizing them
+Returns risk classification for user review
+"""
+
+from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+import pandas as pd
+import numpy as np
+import io
+import os
+import sys
+from typing import Dict, Any, List
+
+# Import cleaning module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+from data_cleaning.cleaner import DataCleaner
+from data_cleaning.config import (
+    ENTITY_STRATEGY_MAP, 
+    STRATEGIES, 
+    GDPR_COMPLIANCE,
+    COLUMN_CONTEXT_FILTERS,
+    EXCLUSION_PATTERNS,
+    get_strategy_for_entity,
+    get_risk_level
+)
+
+router = APIRouter()
+
+
+def convert_to_serializable(obj):
+    """Recursively convert numpy scalars/arrays (incl. np.bool_) and tuples to native JSON-safe Python types"""
+    # np.bool_ is neither np.integer nor np.floating and is not JSON-serializable, so it is mapped explicitly
+    for numpy_type, python_type in ((np.bool_, bool), (np.integer, int), (np.floating, float)):
+        if isinstance(obj, numpy_type):
+            return python_type(obj)
+    if isinstance(obj, np.ndarray):
+        obj = obj.tolist()  # now a (nested) list or a scalar; containers fall through to the branches below
+    if isinstance(obj, dict):
+        return {key: convert_to_serializable(value) for key, value in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [convert_to_serializable(item) for item in obj]
+    return obj
+
+
+@router.post("/detect-pii")
+async def detect_pii(file: UploadFile = File(...)):
+    """
+    Detect PII in uploaded file WITHOUT anonymizing
+
+    - **file**: CSV, JSON, or TXT file to analyze for PII
+
+    Returns:
+        - List of risky features with severity and recommended strategies
+        - Detection confidence scores
+        - GDPR article references
+        - Example values for review
+
+    Raises:
+        HTTPException: 400 if the upload is empty, 500 for any other failure
+    """
+    try:
+        # Read uploaded file; a missing filename is treated as "no extension"
+        contents = await file.read()
+        file_extension = os.path.splitext(file.filename or "")[1].lower()
+
+        # Determine file type and parse accordingly
+        if file_extension == '.csv':
+            df = pd.read_csv(io.BytesIO(contents))
+            file_type = 'csv'
+        elif file_extension == '.json':
+            df = pd.read_json(io.BytesIO(contents))
+            file_type = 'json'
+        elif file_extension in ['.txt', '.text']:
+            # For plain text, create a single-column dataframe
+            text_content = contents.decode('utf-8', errors='ignore')
+            # Split into lines for better granularity
+            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+            df = pd.DataFrame({'text_content': lines})
+            file_type = 'text'
+        else:
+            # Try to auto-detect format
+            try:
+                # Try CSV first
+                df = pd.read_csv(io.BytesIO(contents))
+                file_type = 'csv'
+            except Exception:
+                try:
+                    # Try JSON
+                    df = pd.read_json(io.BytesIO(contents))
+                    file_type = 'json'
+                except Exception:
+                    # Fall back to plain text
+                    text_content = contents.decode('utf-8', errors='ignore')
+                    lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+                    df = pd.DataFrame({'text_content': lines})
+                    file_type = 'text'
+
+        if df.empty:
+            raise HTTPException(status_code=400, detail="Uploaded file is empty")
+
+        print(f"Detecting PII in: {file.filename} ({file_type} format, {len(df)} rows, {len(df.columns)} columns)")
+
+        # Initialize Data Cleaner (with GPU if available)
+        cleaner = DataCleaner(df, use_gpu=True)
+
+        # Detect PII without cleaning
+        pii_detections = cleaner._detect_pii(
+            df=df,
+            risky_columns=None,  # Scan all columns
+            scan_all_cells=True
+        )
+
+        # Classify by risk level
+        risk_classification = cleaner._classify_risk(pii_detections)
+
+        # Build response with detailed feature information
+        risky_features = []
+
+        for risk_level in ['HIGH', 'MEDIUM', 'LOW', 'UNKNOWN']:
+            detections = risk_classification[risk_level]
+            for column, entities in detections.items():
+                for entity_info in entities:
+                    entity_type = entity_info['entity_type']
+                    strategy = entity_info['strategy']
+                    # Example values for review: the first 3 non-null values of the column
+                    sample_values = df[column].dropna().head(3).astype(str).tolist()
+                    # Get GDPR article
+                    gdpr_article = GDPR_COMPLIANCE.get(entity_type, 'Not classified')
+                    # Get strategy details
+                    strategy_details = STRATEGIES.get(strategy, {})
+
+                    risky_features.append({
+                        'column': column,
+                        'entity_type': entity_type,
+                        'risk_level': risk_level,
+                        'confidence': float(entity_info['confidence']),
+                        'detection_count': int(entity_info['count']),
+                        'recommended_strategy': strategy,
+                        'strategy_description': strategy_details.get('description', ''),
+                        'reversible': strategy_details.get('reversible', False),
+                        'use_cases': strategy_details.get('use_cases', []),
+                        'gdpr_article': gdpr_article,
+                        'sample_values': sample_values,
+                        'explanation': _generate_risk_explanation(entity_type, risk_level, strategy)
+                    })
+
+        # Sort by risk level (HIGH -> MEDIUM -> LOW -> UNKNOWN), then by column name
+        risk_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2, 'UNKNOWN': 3}
+        risky_features.sort(key=lambda x: (risk_order[x['risk_level']], x['column']))
+
+        # Prepare summary statistics
+        summary = {
+            'total_columns_scanned': len(df.columns),
+            'risky_columns_found': len(set(f['column'] for f in risky_features)),
+            'high_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'HIGH'),
+            'medium_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'MEDIUM'),
+            'low_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'LOW'),
+            'unique_entity_types': len(set(f['entity_type'] for f in risky_features))
+        }
+
+        response_data = {
+            'status': 'success',
+            'filename': file.filename,
+            'file_type': file_type,
+            'dataset_info': {
+                'rows': len(df),
+                'columns': len(df.columns),
+                'column_names': df.columns.tolist()
+            },
+            'summary': summary,
+            'risky_features': risky_features,
+            'available_strategies': STRATEGIES,
+            'message': f"Found {summary['risky_columns_found']} columns with PII ({summary['high_risk_count']} HIGH risk, {summary['medium_risk_count']} MEDIUM risk, {summary['low_risk_count']} LOW risk)"
+        }
+
+        # Convert all numpy/pandas types to native Python types
+        response_data = convert_to_serializable(response_data)
+
+        return JSONResponse(content=response_data)
+    except HTTPException:
+        # Pass deliberate HTTP errors (e.g. the 400 above) through instead of rewrapping them as a 500
+        raise
+    except pd.errors.EmptyDataError:
+        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
+    except ImportError:
+        raise HTTPException(status_code=500, detail="Presidio not installed. Please install: pip install presidio-analyzer presidio-anonymizer")
+    except Exception as e:
+        print(f"Error during PII detection: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=f"PII detection failed: {str(e)}")
+
+
+def _generate_risk_explanation(entity_type: str, risk_level: str, strategy: str) -> str:
+    """Build the review text for one detection: why the entity type is risky, followed by the suggested action"""
+    # Entity-specific rationale; entity types not listed here get the generic fallback sentence further down
+    known_reasons = {
+        'CREDIT_CARD': "Credit card numbers are highly sensitive financial identifiers protected under GDPR Art. 4(1) and PCI-DSS regulations. Unauthorized disclosure can lead to fraud and identity theft.",
+        'US_SSN': "Social Security Numbers are government-issued identifiers that can be used for identity theft. They are strictly protected under US federal law and GDPR Art. 4(1).",
+        'EMAIL_ADDRESS': "Email addresses are personal identifiers under GDPR Art. 4(1) that can be used to re-identify individuals and track behavior across services.",
+        'PHONE_NUMBER': "Phone numbers are direct personal identifiers under GDPR Art. 4(1) that enable contact and can be used to track individuals.",
+        'PERSON': "Personal names are explicit identifiers under GDPR Art. 4(1) that directly identify individuals and must be protected in datasets.",
+        'LOCATION': "Location data reveals personal information about individuals' movements and residence, protected under GDPR Art. 4(1) as personal data.",
+        'IP_ADDRESS': "IP addresses are online identifiers under GDPR Art. 4(1) that can be used to track individuals across the internet.",
+        'DATE_TIME': "Temporal data can be used to re-identify individuals when combined with other data points, especially for rare events.",
+        'MEDICAL_LICENSE': "Medical information is special category data under GDPR Art. 9(1) requiring heightened protection due to health privacy concerns.",
+        'NRP': "Nationality, religious, or political views are special category data under GDPR Art. 9(1) that can lead to discrimination.",
+        'US_BANK_NUMBER': "Bank account numbers are financial identifiers that enable unauthorized access to accounts and are protected under GDPR Art. 4(1).",
+        'CRYPTO': "Cryptocurrency addresses are financial identifiers that can reveal transaction history and wealth, requiring protection.",
+        'FI_PERSONAL_ID': "Finnish personal identity numbers (HETU) are highly sensitive national identifiers under GDPR Art. 4(1) + Recital 26, granting access to government services.",
+        'SE_PERSONAL_ID': "Swedish Personnummer are national identifiers protected under GDPR Art. 4(1) + Recital 26, used across all government and private services.",
+        'NO_PERSONAL_ID': "Norwegian Fødselsnummer are national ID numbers under GDPR Art. 4(1) + Recital 26, used for all official identification.",
+        'DK_PERSONAL_ID': "Danish CPR numbers are national identifiers protected under GDPR Art. 4(1) + Recital 26, critical for government services.",
+        'FI_BUSINESS_ID': "Finnish business IDs (Y-tunnus) are organizational identifiers with lower risk than personal IDs, but still require protection for business privacy.",
+    }
+    if entity_type in known_reasons:
+        reason = known_reasons[entity_type]
+    else:
+        reason = f"{entity_type} detected as {risk_level} risk personal data under GDPR regulations requiring appropriate protection measures."
+    # Only REMOVE is worded as permanent deletion; every other strategy is described as anonymization
+    effect = 'permanently removes' if strategy == 'REMOVE' else 'anonymizes'
+    return f"{reason} Recommended action: {strategy} - this {effect} the data to ensure compliance."
--- a/data_cleaning/cleaner.py
+++ b/data_cleaning/cleaner.py
@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
    ) -> Dict[str, List[Dict]]:
        """
        Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
        
        Returns:
            Dictionary mapping column names to list of detected entities
        """
+        import re
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+        
        pii_detections = defaultdict(list)
        
        # Determine which columns to scan
@@ -417,26 +422,69 @@ class DataCleaner:
            )
            
            if results:
-                # Aggregate by entity type
-                entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                # Aggregate by entity type with filtering
+                entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                filtered_reasons = []
                
                for result in results:
-                    entity_summary[result.entity_type]['count'] += 1
-                    entity_summary[result.entity_type]['scores'].append(result.score)
+                    entity_type = result.entity_type
+                    # Extract detected text from original string using start/end positions
+                    detected_text = combined_text[result.start:result.end]
+                    
+                    # ✅ FILTER 1: Column Context Filtering
+                    # Skip if entity type should be ignored based on column name
+                    context_filtered = False
+                    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                            context_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if f"column context ({pattern})" not in filtered_reasons:
+                                filtered_reasons.append(f"column context ({pattern})")
+                            break
+                    
+                    if context_filtered:
+                        continue
+                    
+                    # ✅ FILTER 2: Value Pattern Exclusions
+                    # Skip if detected value matches exclusion patterns
+                    pattern_filtered = False
+                    if entity_type in EXCLUSION_PATTERNS:
+                        for exclusion_pattern in EXCLUSION_PATTERNS[entity_type]:
+                            if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                                pattern_filtered = True
+                                entity_summary[entity_type]['filtered'] += 1
+                                if f"value pattern ({exclusion_pattern[:20]}...)" not in filtered_reasons:
+                                    filtered_reasons.append(f"value pattern")
+                                break
+                    
+                    if pattern_filtered:
+                        continue
+                    
+                    # ✅ Not filtered - count as valid detection
+                    entity_summary[entity_type]['count'] += 1
+                    entity_summary[entity_type]['scores'].append(result.score)
                
-                # Store detection results
+                # Store detection results (only non-filtered)
+                detected_types = []
                for entity_type, info in entity_summary.items():
-                    avg_confidence = np.mean(info['scores'])
-                    pii_detections[column].append({
-                        'entity_type': entity_type,
-                        'count': info['count'],
-                        'avg_confidence': avg_confidence,
-                        'max_confidence': max(info['scores']),
-                        'min_confidence': min(info['scores'])
-                    })
+                    if info['count'] > 0:  # Only include if we have valid (non-filtered) detections
+                        avg_confidence = np.mean(info['scores'])
+                        pii_detections[column].append({
+                            'entity_type': entity_type,
+                            'count': info['count'],
+                            'avg_confidence': avg_confidence,
+                            'max_confidence': max(info['scores']),
+                            'min_confidence': min(info['scores'])
+                        })
+                        detected_types.append(entity_type)
                
-                detected_types = [d['entity_type'] for d in pii_detections[column]]
-                print(f"✓ Found: {', '.join(detected_types)}")
+                if detected_types:
+                    print(f"✓ Found: {', '.join(detected_types)}")
+                elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                    total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                    print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                else:
+                    print("(no PII)")
            else:
                print("(no PII)")
        
--- a/data_cleaning/config.py
+++ b/data_cleaning/config.py
@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
    'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
    'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
+}
+
+# Column Context Filters - entity types to ignore based on the name of the column they were detected in.
+# The cleaner lower-cases the column name and tests each pattern with re.search; the first pattern that matches and lists the detected entity type drops that detection as a false positive.
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex, matched against the lower-cased name) -> list of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],  # Column names ending in 'id' are rarely person names. NOTE(review): also matches names such as 'paid' or 'valid'
+    r'.*_id$': ['PERSON'],  # NOTE(review): redundant - every name matching this also matches r'.*id$' above
+}
+
+# Value Pattern Exclusions - Ignore detected values matching these patterns for specific entity types
+# The cleaner applies each pattern with re.match (anchored at the START of the detected text, case-insensitive)
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',    # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',      # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',       # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$', # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',    # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',      # Single letter with optional numbers (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',  # Boolean values
+        r'^(male|female|m|f|other)$',  # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',  # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',  # Test emails
+        r'^[^@]+@(test|demo|example|sample)\.',  # Test domains (local part is consumed first because re.match is start-anchored)
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone patterns
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
 }

 # Custom Recognizers (domain-specific patterns)
--- a/frontend/components/try/CenterPanel.tsx
+++ b/frontend/components/try/CenterPanel.tsx
@@ -2,7 +2,7 @@
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
 import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/indexeddb";
-import { analyzeDataset, cleanDataset, getReportUrl, type AnalyzeResponse, type CleanResponse } from "../../lib/api";
+import { analyzeDataset, cleanDataset, detectPII, getReportUrl, type AnalyzeResponse, type CleanResponse, type DetectPIIResponse } from "../../lib/api";

 interface CenterPanelProps {
 	tab: TryTab;
@@ -38,6 +38,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 	// Analysis results
 	const [analyzeResult, setAnalyzeResult] = useState<AnalyzeResponse | null>(null);
 	const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);
+	const [piiDetectionResult, setPIIDetectionResult] = useState<DetectPIIResponse | null>(null);

 	const reset = () => {
 		setFileMeta(null);
@@ -46,6 +47,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 		setProgressLabel("Processing");
 		setTablePreview(null);
 		setError(null);
+		setPIIDetectionResult(null);
 	};

 	// Handle API calls
@@ -71,6 +73,27 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 		}
 	};

+	/** Calls detectPII with the uploaded file, stores the result, and reports any failure through the error state. */
+	const handleDetectPII = async () => {
+		if (!uploadedFile) {
+			setError("No file uploaded");
+			return;
+		}
+		setIsProcessing(true);
+		setError(null);
+		setProgressLabel("Detecting PII...");
+		try {
+			const result = await detectPII(uploadedFile);
+			setPIIDetectionResult(result);
+			setProgressLabel("PII detection complete!");
+		} catch (err: unknown) {
+			// A thrown value is not guaranteed to be an Error (it may even be null); only read `.message` after narrowing
+			setError((err instanceof Error && err.message) || "PII detection failed");
+		} finally {
+			setIsProcessing(false);
+		}
+	};
+
 	const handleClean = async () => {
 		if (!uploadedFile) {
 			setError("No file uploaded");
@@ -380,6 +403,18 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 													</div>
 												)}
 												
+												{piiDetectionResult && (
+													<div className="mt-3 p-3 bg-blue-50 border border-blue-200 rounded-md text-sm text-blue-700">
+														🔍 PII Detection complete! Found {piiDetectionResult.summary.risky_columns_found} risky columns in {piiDetectionResult.file_type.toUpperCase()} file.
+														<div className="mt-1 text-xs">
+															<span className="font-semibold text-red-700">{piiDetectionResult.summary.high_risk_count} HIGH</span> • 
+															<span className="font-semibold text-orange-600 ml-1">{piiDetectionResult.summary.medium_risk_count} MEDIUM</span> • 
+															<span className="font-semibold text-yellow-600 ml-1">{piiDetectionResult.summary.low_risk_count} LOW</span>
+														</div>
+														<p className="mt-2 text-xs">Review detected risks in the "Bias & Risk Mitigation" tab to choose anonymization strategies.</p>
+													</div>
+												)}
+												
 												{analyzeResult && (
 													<div className="mt-3 p-3 bg-green-50 border border-green-200 rounded-md text-sm text-green-700">
 														✅ Analysis complete! View results in tabs.
@@ -426,6 +461,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 																setLoadedFromCache(false);
 																setAnalyzeResult(null);
 																setCleanResult(null);
+																setPIIDetectionResult(null);
 															}}
 														className="text-xs rounded-md border px-3 py-1.5 hover:bg-slate-50"
 													>
@@ -433,11 +469,11 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 													</button>
 													<button
 														type="button"
-														onClick={handleClean}
+														onClick={handleDetectPII}
 														disabled={isProcessing}
-														className="text-xs rounded-md bg-green-600 text-white px-3 py-1.5 hover:bg-green-500 disabled:opacity-50 disabled:cursor-not-allowed"
+														className="text-xs rounded-md bg-blue-600 text-white px-3 py-1.5 hover:bg-blue-500 disabled:opacity-50 disabled:cursor-not-allowed"
 													>
-														{isProcessing ? "Processing..." : "Clean (PII)"}
+														{isProcessing ? "Processing..." : "🔍 Detect PII"}
 													</button>
 													<button
 														type="button"
@@ -445,7 +481,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 														disabled={isProcessing}
 														className="text-xs rounded-md bg-brand-600 text-white px-3 py-1.5 hover:bg-brand-500 disabled:opacity-50 disabled:cursor-not-allowed"
 													>
-														{isProcessing ? "Processing..." : "Analyze"}
+														{isProcessing ? "Processing..." : "⚡ Analyze"}
 													</button>
 												</div>
 								</div>
@@ -1100,20 +1136,190 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
 				);
 			case "bias-risk-mitigation":
 				return (
-					<div className="space-y-4">
-						<h2 className="text-xl font-semibold">Mitigation Suggestions</h2>
-						{analyzeResult && analyzeResult.recommendations.length > 0 ? (
-							<div className="space-y-2">
-								{analyzeResult.recommendations.map((rec, i) => (
-									<div key={i} className="p-3 bg-blue-50 border border-blue-200 rounded-md text-sm">
-										{rec}
+					<div className="space-y-6">
+						<div>
+							<h2 className="text-2xl font-bold mb-2">PII Detection & Anonymization Strategy</h2>
+							<p className="text-sm text-slate-600">Review detected risky features and choose how to anonymize them</p>
+						</div>
+						
+						{piiDetectionResult ? (
+							<div className="space-y-6">
+								{/* File Info Banner */}
+								<div className="p-3 bg-slate-100 border border-slate-300 rounded-lg text-sm">
+									<div className="flex items-center gap-3">
+										<span className="font-semibold text-slate-700">File:</span>
+										<code className="px-2 py-1 bg-white rounded border border-slate-200">{piiDetectionResult.filename}</code>
+										<span className="px-2 py-0.5 bg-blue-100 text-blue-800 text-xs font-semibold rounded">
+											{piiDetectionResult.file_type.toUpperCase()}
+										</span>
+										<span className="text-slate-600">
+											{piiDetectionResult.dataset_info.rows} rows × {piiDetectionResult.dataset_info.columns} columns
+										</span>
 									</div>
-								))}
+								</div>
+
+								{/* Summary Card */}
+								<div className="p-6 bg-gradient-to-br from-blue-50 to-indigo-50 rounded-xl border-2 border-blue-200">
+									<div className="grid grid-cols-1 md:grid-cols-4 gap-4">
+										<div>
+											<div className="text-xs font-semibold text-blue-700 mb-1">TOTAL COLUMNS SCANNED</div>
+											<div className="text-3xl font-bold text-blue-900">{piiDetectionResult.summary.total_columns_scanned}</div>
+										</div>
+										<div>
+											<div className="text-xs font-semibold text-red-700 mb-1">HIGH RISK</div>
+											<div className="text-3xl font-bold text-red-900">{piiDetectionResult.summary.high_risk_count}</div>
+											<div className="text-xs text-slate-600">Must remove</div>
+										</div>
+										<div>
+											<div className="text-xs font-semibold text-orange-700 mb-1">MEDIUM RISK</div>
+											<div className="text-3xl font-bold text-orange-900">{piiDetectionResult.summary.medium_risk_count}</div>
+											<div className="text-xs text-slate-600">Hash recommended</div>
+										</div>
+										<div>
+											<div className="text-xs font-semibold text-yellow-700 mb-1">LOW RISK</div>
+											<div className="text-3xl font-bold text-yellow-900">{piiDetectionResult.summary.low_risk_count}</div>
+											<div className="text-xs text-slate-600">Mask/generalize</div>
+										</div>
+									</div>
+									<div className="mt-4 p-3 bg-white/70 rounded-lg text-sm text-slate-700">
+										{piiDetectionResult.message}
+									</div>
+								</div>
+
+								{/* Risky Features List */}
+								<div className="space-y-3">
+									{piiDetectionResult.risky_features.map((feature, idx) => {
+										const riskColor = 
+											feature.risk_level === 'HIGH' ? 'red' :
+											feature.risk_level === 'MEDIUM' ? 'orange' :
+											feature.risk_level === 'LOW' ? 'yellow' : 'gray';
+										
+										const bgColor = 
+											feature.risk_level === 'HIGH' ? 'bg-red-50 border-red-300' :
+											feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' :
+											feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' : 'bg-gray-50 border-gray-300';
+										
+										return (
+											<div key={idx} className={`p-5 rounded-xl border-2 ${bgColor}`}>
+												{/* Header */}
+												<div className="flex items-start justify-between mb-3">
+													<div className="flex-1">
+														<div className="flex items-center gap-3 mb-2">
+															<span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
+																{feature.risk_level} RISK
+															</span>
+															<span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
+														</div>
+														<div className="text-sm text-slate-700">
+															<span className="font-semibold">Detected:</span> {feature.entity_type}
+															<span className="mx-2">•</span>
+															<span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
+															<span className="mx-2">•</span>
+															<span className="font-semibold">Occurrences:</span> {feature.detection_count}
+														</div>
+													</div>
+												</div>
+
+												{/* Explanation */}
+												<div className="p-4 bg-white rounded-lg mb-4">
+													<div className="text-xs font-semibold text-slate-600 mb-2">WHY IS THIS RISKY?</div>
+													<p className="text-sm text-slate-700 leading-relaxed">{feature.explanation}</p>
+													<div className="mt-3 text-xs text-slate-600">
+														<strong>GDPR Reference:</strong> {feature.gdpr_article}
+													</div>
+												</div>
+
+												{/* Sample Values */}
+												{feature.sample_values.length > 0 && (
+													<div className="p-4 bg-white rounded-lg mb-4">
+														<div className="text-xs font-semibold text-slate-600 mb-2">SAMPLE VALUES</div>
+														<div className="flex gap-2 flex-wrap">
+															{feature.sample_values.map((val, i) => (
+																<code key={i} className="px-2 py-1 bg-slate-100 rounded text-xs text-slate-800 border border-slate-200">
+																	{val}
+																</code>
+															))}
+														</div>
+													</div>
+												)}
+
+												{/* Recommended Strategy */}
+												<div className="p-4 bg-white rounded-lg border-2 border-green-300">
+													<div className="flex items-start gap-3">
+														<div className="flex-1">
+															<div className="text-xs font-semibold text-green-700 mb-1">✓ RECOMMENDED STRATEGY</div>
+															<div className="font-bold text-lg text-slate-900">{feature.recommended_strategy}</div>
+															<div className="text-sm text-slate-700 mt-1">{feature.strategy_description}</div>
+															<div className="mt-2 flex gap-4 text-xs text-slate-600">
+																<div>
+																	<strong>Reversible:</strong> {feature.reversible ? 'Yes' : 'No'}
+																</div>
+																<div>
+																	<strong>Use Cases:</strong> {feature.use_cases.join(', ')}
+																</div>
+															</div>
+														</div>
+														<button
+															className="px-4 py-2 bg-green-600 text-white text-sm font-semibold rounded-lg hover:bg-green-500"
+															onClick={() => alert(`Apply ${feature.recommended_strategy} to ${feature.column}`)}
+														>
+															Apply
+														</button>
+													</div>
+												</div>
+
+												{/* Alternative Strategies */}
+												<details className="mt-3">
+													<summary className="text-xs font-semibold text-slate-600 cursor-pointer hover:text-slate-800">
+														View Alternative Strategies
+													</summary>
+													<div className="mt-2 grid grid-cols-1 md:grid-cols-2 gap-2">
+														{Object.entries(piiDetectionResult.available_strategies)
+															.filter(([strategy]) => strategy !== feature.recommended_strategy)
+															.map(([strategy, details]: [string, any]) => (
+																<div key={strategy} className="p-3 bg-white rounded border border-slate-200 hover:border-slate-400">
+																	<div className="font-semibold text-sm text-slate-800">{strategy}</div>
+																	<div className="text-xs text-slate-600 mt-1">{details.description}</div>
+																	<div className="mt-2 flex items-center justify-between">
+																		<span className={`px-2 py-0.5 text-xs rounded ${
+																			details.risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
+																			details.risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
+																			'bg-yellow-100 text-yellow-800'
+																		}`}>
+																			{details.risk_level} Risk
+																		</span>
+																		<button
+																			className="px-2 py-1 bg-blue-600 text-white text-xs rounded hover:bg-blue-500"
+																			onClick={() => alert(`Apply ${strategy} to ${feature.column}`)}
+																		>
+																			Use This
+																		</button>
+																	</div>
+																</div>
+															))}
+													</div>
+												</details>
+											</div>
+										);
+									})}
+								</div>
+
+								{/* Apply All Button */}
+								<div className="sticky bottom-0 p-4 bg-gradient-to-t from-white via-white to-transparent">
+									<button
+										className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg"
+										onClick={() => alert('Apply all recommended strategies and clean dataset')}
+									>
+										✓ Apply All Recommended Strategies & Clean Dataset
+									</button>
+								</div>
 							</div>
 						) : (
-							<p className="text-sm text-slate-600">
-								Recommendations will appear here after analysis.
-							</p>
+							<div className="text-center py-12">
+								<div className="text-6xl mb-4">🔍</div>
+								<p className="text-slate-600 mb-2">No PII detection results yet</p>
+								<p className="text-sm text-slate-500">Upload a dataset and click "🔍 Detect PII" to scan for risky features</p>
+							</div>
 						)}
 					</div>
 				);
--- a/frontend/lib/api.ts
+++ b/frontend/lib/api.ts
@@ -74,6 +74,46 @@ export interface CleanResponse {
  timestamp: string;
 }

+export interface DetectPIIResponse { // JSON body that detectPII() resolves with (POST /api/detect-pii)
+  status: string;
+  filename: string;
+  file_type: 'csv' | 'json' | 'text';
+  dataset_info: {
+    rows: number;
+    columns: number; // presumably the column count (names are in column_names) — confirm against backend
+    column_names: string[];
+  };
+  summary: {
+    total_columns_scanned: number;
+    risky_columns_found: number;
+    high_risk_count: number;
+    medium_risk_count: number;
+    low_risk_count: number;
+    unique_entity_types: number;
+  };
+  risky_features: Array<{ // the UI renders each entry as one card
+    column: string;
+    entity_type: string; // shown in the UI as "Detected"
+    risk_level: 'HIGH' | 'MEDIUM' | 'LOW' | 'UNKNOWN'; // UI card color: HIGH red, MEDIUM orange, LOW yellow, otherwise gray
+    confidence: number; // UI displays confidence * 100 with a % sign, so presumably a 0–1 fraction — confirm
+    detection_count: number; // shown in the UI as "Occurrences"
+    recommended_strategy: string; // UI omits this key when listing alternatives from available_strategies
+    strategy_description: string;
+    reversible: boolean; // rendered as Yes / No
+    use_cases: string[]; // rendered joined with ', '
+    gdpr_article: string; // shown in the UI as "GDPR Reference"
+    sample_values: string[]; // UI hides the "SAMPLE VALUES" section when this is empty
+    explanation: string; // shown under "WHY IS THIS RISKY?"
+  }>;
+  available_strategies: Record<string, { // keys are strategy names; the UI compares them to recommended_strategy
+    description: string;
+    risk_level: string; // UI styles 'HIGH' and 'MEDIUM' distinctly; any other value gets the yellow badge
+    reversible: boolean;
+    use_cases: string[];
+  }>;
+  message: string;
+}
+
 /**
 * Analyze dataset for bias and risk
 */
@@ -114,6 +154,26 @@ export async function cleanDataset(file: File): Promise<CleanResponse> {
  return response.json();
 }

+/**
+ * Upload a file to /api/detect-pii and return the PII report (nothing is anonymized).
+ * @throws Error with the server's string `detail`, or 'PII detection failed' otherwise.
+ */
+export async function detectPII(file: File): Promise<DetectPIIResponse> {
+  const formData = new FormData();
+  formData.append('file', file);
+  const response = await fetch(`${API_BASE_URL}/api/detect-pii`, {
+    method: 'POST',
+    body: formData,
+  });
+
+  if (!response.ok) {
+    // The error body may not be JSON (e.g. a proxy's HTML page), and `detail` may not be a string.
+    const detail: unknown = (await response.json().catch(() => null))?.detail;
+    throw new Error(typeof detail === 'string' && detail !== '' ? detail : 'PII detection failed');
+  }
+  return response.json();
+}
+
 /**
 * Download report file
 */
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ python-multipart>=0.0.6
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121

 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0