Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07)
.gitignore (vendored, 2 lines changed)
@@ -74,4 +74,4 @@ frontend/nordic-privacy-ai/.next/
 frontend/nordic-privacy-ai/out/
 frontend/nordic-privacy-ai/node_modules/
 
-Data
+Datamain.py
GDPRArticles.pdf (new binary file, not shown)
FastAPI application entrypoint:

@@ -8,7 +8,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 import os
 
-from api.routers import analyze, clean, discovery
+from api.routers import analyze, clean, discovery, detect_pii
 
 # Create FastAPI app
 app = FastAPI(
@@ -37,6 +37,7 @@ app.mount("/reports", StaticFiles(directory=reports_dir), name="reports")
 # Include routers
 app.include_router(analyze.router, prefix="/api", tags=["AI Governance"])
 app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"])
+app.include_router(detect_pii.router, prefix="/api", tags=["PII Detection"])
 app.include_router(discovery.router, prefix="/api", tags=["Discover sources"])
 
 @app.get("/")
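With the router included under prefix "/api" and the route declared as @router.post("/detect-pii") in the new file below, the endpoint resolves to POST /api/detect-pii. A small sketch (not part of this PR) that lists the registered routes to confirm the wiring; the module path "main" for the app object is an assumption:

# Sketch only, assuming the FastAPI app above is importable as main.app (hypothetical path).
from main import app

for route in app.routes:
    methods = getattr(route, "methods", None)  # mounted static dirs (e.g. /reports) have no methods
    print(methods, route.path)
# The output should include: {'POST'} /api/detect-pii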
api/routers/detect_pii.py (new file, 224 lines)
@@ -0,0 +1,224 @@
"""
PII Detection Router
Detects risky features WITHOUT anonymizing them
Returns risk classification for user review
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import sys
from typing import Dict, Any, List

# Import cleaning module
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from data_cleaning.cleaner import DataCleaner
from data_cleaning.config import (
    ENTITY_STRATEGY_MAP,
    STRATEGIES,
    GDPR_COMPLIANCE,
    COLUMN_CONTEXT_FILTERS,
    EXCLUSION_PATTERNS,
    get_strategy_for_entity,
    get_risk_level
)

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj


@router.post("/detect-pii")
async def detect_pii(file: UploadFile = File(...)):
    """
    Detect PII in uploaded file WITHOUT anonymizing

    - **file**: CSV, JSON, or TXT file to analyze for PII

    Returns:
    - List of risky features with severity and recommended strategies
    - Detection confidence scores
    - GDPR article references
    - Example values for review
    """

    try:
        # Read uploaded file
        contents = await file.read()
        file_extension = os.path.splitext(file.filename)[1].lower()

        # Determine file type and parse accordingly
        if file_extension == '.csv':
            df = pd.read_csv(io.BytesIO(contents))
            file_type = 'csv'
        elif file_extension == '.json':
            df = pd.read_json(io.BytesIO(contents))
            file_type = 'json'
        elif file_extension in ['.txt', '.text']:
            # For plain text, create a single-column dataframe
            text_content = contents.decode('utf-8', errors='ignore')
            # Split into lines for better granularity
            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
            df = pd.DataFrame({'text_content': lines})
            file_type = 'text'
        else:
            # Try to auto-detect format
            try:
                # Try CSV first
                df = pd.read_csv(io.BytesIO(contents))
                file_type = 'csv'
            except:
                try:
                    # Try JSON
                    df = pd.read_json(io.BytesIO(contents))
                    file_type = 'json'
                except:
                    # Fall back to plain text
                    text_content = contents.decode('utf-8', errors='ignore')
                    lines = [line.strip() for line in text_content.split('\n') if line.strip()]
                    df = pd.DataFrame({'text_content': lines})
                    file_type = 'text'

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        print(f"Detecting PII in: {file.filename} ({file_type} format, {len(df)} rows, {len(df.columns)} columns)")

        # Initialize Data Cleaner (with GPU if available)
        cleaner = DataCleaner(df, use_gpu=True)

        # Detect PII without cleaning
        pii_detections = cleaner._detect_pii(
            df=df,
            risky_columns=None,  # Scan all columns
            scan_all_cells=True
        )

        # Classify by risk level
        risk_classification = cleaner._classify_risk(pii_detections)

        # Build response with detailed feature information
        risky_features = []

        for risk_level in ['HIGH', 'MEDIUM', 'LOW', 'UNKNOWN']:
            detections = risk_classification[risk_level]

            for column, entities in detections.items():
                for entity_info in entities:
                    entity_type = entity_info['entity_type']
                    strategy = entity_info['strategy']

                    # Get example values from the column (first 5 non-null; only 3 are returned below)
                    sample_values = df[column].dropna().head(5).astype(str).tolist()

                    # Get GDPR article
                    gdpr_article = GDPR_COMPLIANCE.get(entity_type, 'Not classified')

                    # Get strategy details
                    strategy_details = STRATEGIES.get(strategy, {})

                    risky_features.append({
                        'column': column,
                        'entity_type': entity_type,
                        'risk_level': risk_level,
                        'confidence': float(entity_info['confidence']),
                        'detection_count': int(entity_info['count']),
                        'recommended_strategy': strategy,
                        'strategy_description': strategy_details.get('description', ''),
                        'reversible': strategy_details.get('reversible', False),
                        'use_cases': strategy_details.get('use_cases', []),
                        'gdpr_article': gdpr_article,
                        'sample_values': sample_values[:3],  # Show 3 examples
                        'explanation': _generate_risk_explanation(entity_type, risk_level, strategy)
                    })

        # Sort by risk level (HIGH -> MEDIUM -> LOW)
        risk_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2, 'UNKNOWN': 3}
        risky_features.sort(key=lambda x: (risk_order[x['risk_level']], x['column']))

        # Prepare summary statistics
        summary = {
            'total_columns_scanned': len(df.columns),
            'risky_columns_found': len(set(f['column'] for f in risky_features)),
            'high_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'HIGH'),
            'medium_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'MEDIUM'),
            'low_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'LOW'),
            'unique_entity_types': len(set(f['entity_type'] for f in risky_features))
        }

        response_data = {
            'status': 'success',
            'filename': file.filename,
            'file_type': file_type,
            'dataset_info': {
                'rows': len(df),
                'columns': len(df.columns),
                'column_names': df.columns.tolist()
            },
            'summary': summary,
            'risky_features': risky_features,
            'available_strategies': STRATEGIES,
            'message': f"Found {summary['risky_columns_found']} columns with PII ({summary['high_risk_count']} HIGH risk, {summary['medium_risk_count']} MEDIUM risk, {summary['low_risk_count']} LOW risk)"
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except ImportError as e:
        raise HTTPException(status_code=500, detail="Presidio not installed. Please install: pip install presidio-analyzer presidio-anonymizer")
    except Exception as e:
        print(f"Error during PII detection: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"PII detection failed: {str(e)}")


def _generate_risk_explanation(entity_type: str, risk_level: str, strategy: str) -> str:
    """Generate human-readable explanation for why a feature is risky"""

    explanations = {
        'CREDIT_CARD': "Credit card numbers are highly sensitive financial identifiers protected under GDPR Art. 4(1) and PCI-DSS regulations. Unauthorized disclosure can lead to fraud and identity theft.",
        'US_SSN': "Social Security Numbers are government-issued identifiers that can be used for identity theft. They are strictly protected under US federal law and GDPR Art. 4(1).",
        'EMAIL_ADDRESS': "Email addresses are personal identifiers under GDPR Art. 4(1) that can be used to re-identify individuals and track behavior across services.",
        'PHONE_NUMBER': "Phone numbers are direct personal identifiers under GDPR Art. 4(1) that enable contact and can be used to track individuals.",
        'PERSON': "Personal names are explicit identifiers under GDPR Art. 4(1) that directly identify individuals and must be protected in datasets.",
        'LOCATION': "Location data reveals personal information about individuals' movements and residence, protected under GDPR Art. 4(1) as personal data.",
        'IP_ADDRESS': "IP addresses are online identifiers under GDPR Art. 4(1) that can be used to track individuals across the internet.",
        'DATE_TIME': "Temporal data can be used to re-identify individuals when combined with other data points, especially for rare events.",
        'MEDICAL_LICENSE': "Medical information is special category data under GDPR Art. 9(1) requiring heightened protection due to health privacy concerns.",
        'NRP': "Nationality, religious, or political views are special category data under GDPR Art. 9(1) that can lead to discrimination.",
        'US_BANK_NUMBER': "Bank account numbers are financial identifiers that enable unauthorized access to accounts and are protected under GDPR Art. 4(1).",
        'CRYPTO': "Cryptocurrency addresses are financial identifiers that can reveal transaction history and wealth, requiring protection.",
        'FI_PERSONAL_ID': "Finnish personal identity numbers (HETU) are highly sensitive national identifiers under GDPR Art. 4(1) + Recital 26, granting access to government services.",
        'SE_PERSONAL_ID': "Swedish Personnummer are national identifiers protected under GDPR Art. 4(1) + Recital 26, used across all government and private services.",
        'NO_PERSONAL_ID': "Norwegian Fødselsnummer are national ID numbers under GDPR Art. 4(1) + Recital 26, used for all official identification.",
        'DK_PERSONAL_ID': "Danish CPR numbers are national identifiers protected under GDPR Art. 4(1) + Recital 26, critical for government services.",
        'FI_BUSINESS_ID': "Finnish business IDs (Y-tunnus) are organizational identifiers with lower risk than personal IDs, but still require protection for business privacy.",
    }

    base_explanation = explanations.get(entity_type,
        f"{entity_type} detected as {risk_level} risk personal data under GDPR regulations requiring appropriate protection measures.")

    strategy_note = f" Recommended action: {strategy} - this {'permanently removes' if strategy == 'REMOVE' else 'anonymizes'} the data to ensure compliance."

    return base_explanation + strategy_note
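A minimal client sketch for the new endpoint (not part of the PR): it assumes the API runs locally at http://localhost:8000 and that a customers.csv file exists; the response keys match the router above.

# Hedged usage sketch for POST /api/detect-pii; base URL and filename are assumptions.
import requests

with open("customers.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/detect-pii",
        files={"file": ("customers.csv", f, "text/csv")},
    )
resp.raise_for_status()
report = resp.json()

print(report["message"])
for feature in report["risky_features"]:
    print(feature["risk_level"], feature["column"], feature["entity_type"],
          "->", feature["recommended_strategy"])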
data_cleaning/cleaner.py:

@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
     ) -> Dict[str, List[Dict]]:
         """
         Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
 
         Returns:
             Dictionary mapping column names to list of detected entities
         """
+        import re
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+
         pii_detections = defaultdict(list)
 
         # Determine which columns to scan
@@ -417,26 +422,69 @@ class DataCleaner:
             )
 
             if results:
-                # Aggregate by entity type
-                entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                # Aggregate by entity type with filtering
+                entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                filtered_reasons = []
+
                 for result in results:
-                    entity_summary[result.entity_type]['count'] += 1
-                    entity_summary[result.entity_type]['scores'].append(result.score)
+                    entity_type = result.entity_type
+                    # Extract detected text from original string using start/end positions
+                    detected_text = combined_text[result.start:result.end]
 
-                # Store detection results
+                    # ✅ FILTER 1: Column Context Filtering
+                    # Skip if entity type should be ignored based on column name
+                    context_filtered = False
+                    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                            context_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if f"column context ({pattern})" not in filtered_reasons:
+                                filtered_reasons.append(f"column context ({pattern})")
+                            break
+
+                    if context_filtered:
+                        continue
+
+                    # ✅ FILTER 2: Value Pattern Exclusions
+                    # Skip if detected value matches exclusion patterns
+                    pattern_filtered = False
+                    if entity_type in EXCLUSION_PATTERNS:
+                        for exclusion_pattern in EXCLUSION_PATTERNS[entity_type]:
+                            if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                                pattern_filtered = True
+                                entity_summary[entity_type]['filtered'] += 1
+                                if f"value pattern ({exclusion_pattern[:20]}...)" not in filtered_reasons:
+                                    filtered_reasons.append(f"value pattern")
+                                break
+
+                    if pattern_filtered:
+                        continue
+
+                    # ✅ Not filtered - count as valid detection
+                    entity_summary[entity_type]['count'] += 1
+                    entity_summary[entity_type]['scores'].append(result.score)
+
+                # Store detection results (only non-filtered)
+                detected_types = []
                 for entity_type, info in entity_summary.items():
-                    avg_confidence = np.mean(info['scores'])
-                    pii_detections[column].append({
-                        'entity_type': entity_type,
-                        'count': info['count'],
-                        'avg_confidence': avg_confidence,
-                        'max_confidence': max(info['scores']),
-                        'min_confidence': min(info['scores'])
-                    })
+                    if info['count'] > 0:  # Only include if we have valid (non-filtered) detections
+                        avg_confidence = np.mean(info['scores'])
+                        pii_detections[column].append({
+                            'entity_type': entity_type,
+                            'count': info['count'],
+                            'avg_confidence': avg_confidence,
+                            'max_confidence': max(info['scores']),
+                            'min_confidence': min(info['scores'])
+                        })
+                        detected_types.append(entity_type)
 
-                detected_types = [d['entity_type'] for d in pii_detections[column]]
-                print(f"✓ Found: {', '.join(detected_types)}")
+                if detected_types:
+                    print(f"✓ Found: {', '.join(detected_types)}")
+                elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                    total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                    print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                else:
+                    print("(no PII)")
             else:
                 print("(no PII)")
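Read in isolation, the two filters above behave as in this self-contained sketch (toy patterns standing in for the full COLUMN_CONTEXT_FILTERS / EXCLUSION_PATTERNS config), which shows why a value like "l1" in a credit_rating column no longer counts as a US_DRIVER_LICENSE hit:

# Standalone illustration of the two-stage filtering; the patterns here are
# simplified stand-ins for the project's COLUMN_CONTEXT_FILTERS / EXCLUSION_PATTERNS.
import re

COLUMN_CONTEXT_FILTERS = {r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT']}
EXCLUSION_PATTERNS = {'US_DRIVER_LICENSE': [r'^[a-zA-Z]\d{1,2}$']}

def keep_detection(column: str, entity_type: str, detected_text: str) -> bool:
    # Filter 1: entity types that the column name tells us to ignore
    for pattern, ignored in COLUMN_CONTEXT_FILTERS.items():
        if re.search(pattern, column.lower()) and entity_type in ignored:
            return False
    # Filter 2: values whose shape marks them as false positives
    for exclusion in EXCLUSION_PATTERNS.get(entity_type, []):
        if re.match(exclusion, detected_text, re.IGNORECASE):
            return False
    return True

print(keep_detection("credit_rating", "US_DRIVER_LICENSE", "l1"))    # False (filter 1)
print(keep_detection("notes", "US_DRIVER_LICENSE", "B12"))           # False (filter 2)
print(keep_detection("notes", "US_DRIVER_LICENSE", "D12345678"))     # True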
data_cleaning/config.py:

@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
     'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
     'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
 }
 
+# Column Context Filters - Ignore specific entity types based on column name patterns
+# This prevents false positives when column names provide context
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex) -> List of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],  # IDs ending in 'id' are rarely names
+    r'.*_id$': ['PERSON'],  # Same for underscore_id
+}
+
+# Value Pattern Exclusions - Ignore values matching these patterns for specific entity types
+# This catches false positives based on the actual detected value format
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',  # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',  # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',  # Single letter with optional numbers (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',  # Boolean values
+        r'^(male|female|m|f|other)$',  # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',  # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',  # Test emails
+        r'@(test|demo|example|sample)\.',  # Test domains
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone patterns
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
+}
+
 # Custom Recognizers (domain-specific patterns)
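For reference, a hedged sketch of how PRESIDIO_CONFIG feeds a Presidio call; it assumes presidio-analyzer is installed and that AnalyzerEngine.analyze accepts language, entities, score_threshold and, in recent releases, allow_list. This is background on the library, not something added by the diff:

# Sketch only: wiring PRESIDIO_CONFIG into presidio-analyzer (allow_list exists
# in recent presidio-analyzer releases; verify against the installed version).
from presidio_analyzer import AnalyzerEngine
from data_cleaning.config import PRESIDIO_CONFIG

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="Contact Jane Doe at jane.doe@example.com, rating l2",
    language=PRESIDIO_CONFIG['language'],
    entities=PRESIDIO_CONFIG['entities'],            # None = run all recognizers
    score_threshold=PRESIDIO_CONFIG['score_threshold'],
    allow_list=PRESIDIO_CONFIG['allow_list'],        # terms like 'l2' are skipped outright
)
for r in results:
    print(r.entity_type, round(r.score, 2), r.start, r.end)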
Frontend, CenterPanel component:

@@ -2,7 +2,7 @@
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
 import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/indexeddb";
-import { analyzeDataset, cleanDataset, getReportUrl, type AnalyzeResponse, type CleanResponse } from "../../lib/api";
+import { analyzeDataset, cleanDataset, detectPII, getReportUrl, type AnalyzeResponse, type CleanResponse, type DetectPIIResponse } from "../../lib/api";
 
 interface CenterPanelProps {
   tab: TryTab;
@@ -38,6 +38,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
   // Analysis results
   const [analyzeResult, setAnalyzeResult] = useState<AnalyzeResponse | null>(null);
   const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);
+  const [piiDetectionResult, setPIIDetectionResult] = useState<DetectPIIResponse | null>(null);
 
   const reset = () => {
     setFileMeta(null);
@@ -46,6 +47,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
     setProgressLabel("Processing");
     setTablePreview(null);
     setError(null);
+    setPIIDetectionResult(null);
   };
 
   // Handle API calls
@@ -71,6 +73,27 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
     }
   };
 
+  const handleDetectPII = async () => {
+    if (!uploadedFile) {
+      setError("No file uploaded");
+      return;
+    }
+
+    setIsProcessing(true);
+    setError(null);
+    setProgressLabel("Detecting PII...");
+
+    try {
+      const result = await detectPII(uploadedFile);
+      setPIIDetectionResult(result);
+      setProgressLabel("PII detection complete!");
+    } catch (err: any) {
+      setError(err.message || "PII detection failed");
+    } finally {
+      setIsProcessing(false);
+    }
+  };
+
   const handleClean = async () => {
     if (!uploadedFile) {
       setError("No file uploaded");
@@ -380,6 +403,18 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
           </div>
         )}
 
+        {piiDetectionResult && (
+          <div className="mt-3 p-3 bg-blue-50 border border-blue-200 rounded-md text-sm text-blue-700">
+            🔍 PII Detection complete! Found {piiDetectionResult.summary.risky_columns_found} risky columns in {piiDetectionResult.file_type.toUpperCase()} file.
+            <div className="mt-1 text-xs">
+              <span className="font-semibold text-red-700">{piiDetectionResult.summary.high_risk_count} HIGH</span> •
+              <span className="font-semibold text-orange-600 ml-1">{piiDetectionResult.summary.medium_risk_count} MEDIUM</span> •
+              <span className="font-semibold text-yellow-600 ml-1">{piiDetectionResult.summary.low_risk_count} LOW</span>
+            </div>
+            <p className="mt-2 text-xs">Review detected risks in the "Bias & Risk Mitigation" tab to choose anonymization strategies.</p>
+          </div>
+        )}
+
         {analyzeResult && (
           <div className="mt-3 p-3 bg-green-50 border border-green-200 rounded-md text-sm text-green-700">
             ✅ Analysis complete! View results in tabs.
@@ -426,6 +461,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
                 setLoadedFromCache(false);
                 setAnalyzeResult(null);
                 setCleanResult(null);
+                setPIIDetectionResult(null);
               }}
               className="text-xs rounded-md border px-3 py-1.5 hover:bg-slate-50"
             >
@@ -433,11 +469,11 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
             </button>
             <button
               type="button"
-              onClick={handleClean}
+              onClick={handleDetectPII}
               disabled={isProcessing}
-              className="text-xs rounded-md bg-green-600 text-white px-3 py-1.5 hover:bg-green-500 disabled:opacity-50 disabled:cursor-not-allowed"
+              className="text-xs rounded-md bg-blue-600 text-white px-3 py-1.5 hover:bg-blue-500 disabled:opacity-50 disabled:cursor-not-allowed"
             >
-              {isProcessing ? "Processing..." : "Clean (PII)"}
+              {isProcessing ? "Processing..." : "🔍 Detect PII"}
             </button>
             <button
               type="button"
@@ -445,7 +481,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
               disabled={isProcessing}
               className="text-xs rounded-md bg-brand-600 text-white px-3 py-1.5 hover:bg-brand-500 disabled:opacity-50 disabled:cursor-not-allowed"
             >
-              {isProcessing ? "Processing..." : "Analyze"}
+              {isProcessing ? "Processing..." : "⚡ Analyze"}
             </button>
           </div>
         </div>
@@ -1100,20 +1136,190 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
       );
     case "bias-risk-mitigation":
       return (
-        <div className="space-y-4">
-          <h2 className="text-xl font-semibold">Mitigation Suggestions</h2>
-          {analyzeResult && analyzeResult.recommendations.length > 0 ? (
-            <div className="space-y-2">
-              {analyzeResult.recommendations.map((rec, i) => (
-                <div key={i} className="p-3 bg-blue-50 border border-blue-200 rounded-md text-sm">
-                  {rec}
-                </div>
-              ))}
-            </div>
-          ) : (
-            <p className="text-sm text-slate-600">
-              Recommendations will appear here after analysis.
-            </p>
-          )}
-        </div>
+        <div className="space-y-6">
+          <div>
+            <h2 className="text-2xl font-bold mb-2">PII Detection & Anonymization Strategy</h2>
+            <p className="text-sm text-slate-600">Review detected risky features and choose how to anonymize them</p>
+          </div>
+
+          {piiDetectionResult ? (
+            <div className="space-y-6">
+              {/* File Info Banner */}
+              <div className="p-3 bg-slate-100 border border-slate-300 rounded-lg text-sm">
+                <div className="flex items-center gap-3">
+                  <span className="font-semibold text-slate-700">File:</span>
+                  <code className="px-2 py-1 bg-white rounded border border-slate-200">{piiDetectionResult.filename}</code>
+                  <span className="px-2 py-0.5 bg-blue-100 text-blue-800 text-xs font-semibold rounded">
+                    {piiDetectionResult.file_type.toUpperCase()}
+                  </span>
+                  <span className="text-slate-600">
+                    {piiDetectionResult.dataset_info.rows} rows × {piiDetectionResult.dataset_info.columns} columns
+                  </span>
+                </div>
+              </div>
+
+              {/* Summary Card */}
+              <div className="p-6 bg-gradient-to-br from-blue-50 to-indigo-50 rounded-xl border-2 border-blue-200">
+                <div className="grid grid-cols-1 md:grid-cols-4 gap-4">
+                  <div>
+                    <div className="text-xs font-semibold text-blue-700 mb-1">TOTAL COLUMNS SCANNED</div>
+                    <div className="text-3xl font-bold text-blue-900">{piiDetectionResult.summary.total_columns_scanned}</div>
+                  </div>
+                  <div>
+                    <div className="text-xs font-semibold text-red-700 mb-1">HIGH RISK</div>
+                    <div className="text-3xl font-bold text-red-900">{piiDetectionResult.summary.high_risk_count}</div>
+                    <div className="text-xs text-slate-600">Must remove</div>
+                  </div>
+                  <div>
+                    <div className="text-xs font-semibold text-orange-700 mb-1">MEDIUM RISK</div>
+                    <div className="text-3xl font-bold text-orange-900">{piiDetectionResult.summary.medium_risk_count}</div>
+                    <div className="text-xs text-slate-600">Hash recommended</div>
+                  </div>
+                  <div>
+                    <div className="text-xs font-semibold text-yellow-700 mb-1">LOW RISK</div>
+                    <div className="text-3xl font-bold text-yellow-900">{piiDetectionResult.summary.low_risk_count}</div>
+                    <div className="text-xs text-slate-600">Mask/generalize</div>
+                  </div>
+                </div>
+                <div className="mt-4 p-3 bg-white/70 rounded-lg text-sm text-slate-700">
+                  {piiDetectionResult.message}
+                </div>
+              </div>
+
+              {/* Risky Features List */}
+              <div className="space-y-3">
+                {piiDetectionResult.risky_features.map((feature, idx) => {
+                  const riskColor =
+                    feature.risk_level === 'HIGH' ? 'red' :
+                    feature.risk_level === 'MEDIUM' ? 'orange' :
+                    feature.risk_level === 'LOW' ? 'yellow' : 'gray';
+
+                  const bgColor =
+                    feature.risk_level === 'HIGH' ? 'bg-red-50 border-red-300' :
+                    feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' :
+                    feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' : 'bg-gray-50 border-gray-300';
+
+                  return (
+                    <div key={idx} className={`p-5 rounded-xl border-2 ${bgColor}`}>
+                      {/* Header */}
+                      <div className="flex items-start justify-between mb-3">
+                        <div className="flex-1">
+                          <div className="flex items-center gap-3 mb-2">
+                            <span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
+                              {feature.risk_level} RISK
+                            </span>
+                            <span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
+                          </div>
+                          <div className="text-sm text-slate-700">
+                            <span className="font-semibold">Detected:</span> {feature.entity_type}
+                            <span className="mx-2">•</span>
+                            <span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
+                            <span className="mx-2">•</span>
+                            <span className="font-semibold">Occurrences:</span> {feature.detection_count}
+                          </div>
+                        </div>
+                      </div>
+
+                      {/* Explanation */}
+                      <div className="p-4 bg-white rounded-lg mb-4">
+                        <div className="text-xs font-semibold text-slate-600 mb-2">WHY IS THIS RISKY?</div>
+                        <p className="text-sm text-slate-700 leading-relaxed">{feature.explanation}</p>
+                        <div className="mt-3 text-xs text-slate-600">
+                          <strong>GDPR Reference:</strong> {feature.gdpr_article}
+                        </div>
+                      </div>
+
+                      {/* Sample Values */}
+                      {feature.sample_values.length > 0 && (
+                        <div className="p-4 bg-white rounded-lg mb-4">
+                          <div className="text-xs font-semibold text-slate-600 mb-2">SAMPLE VALUES</div>
+                          <div className="flex gap-2 flex-wrap">
+                            {feature.sample_values.map((val, i) => (
+                              <code key={i} className="px-2 py-1 bg-slate-100 rounded text-xs text-slate-800 border border-slate-200">
+                                {val}
+                              </code>
+                            ))}
+                          </div>
+                        </div>
+                      )}
+
+                      {/* Recommended Strategy */}
+                      <div className="p-4 bg-white rounded-lg border-2 border-green-300">
+                        <div className="flex items-start gap-3">
+                          <div className="flex-1">
+                            <div className="text-xs font-semibold text-green-700 mb-1">✓ RECOMMENDED STRATEGY</div>
+                            <div className="font-bold text-lg text-slate-900">{feature.recommended_strategy}</div>
+                            <div className="text-sm text-slate-700 mt-1">{feature.strategy_description}</div>
+                            <div className="mt-2 flex gap-4 text-xs text-slate-600">
+                              <div>
+                                <strong>Reversible:</strong> {feature.reversible ? 'Yes' : 'No'}
+                              </div>
+                              <div>
+                                <strong>Use Cases:</strong> {feature.use_cases.join(', ')}
+                              </div>
+                            </div>
+                          </div>
+                          <button
+                            className="px-4 py-2 bg-green-600 text-white text-sm font-semibold rounded-lg hover:bg-green-500"
+                            onClick={() => alert(`Apply ${feature.recommended_strategy} to ${feature.column}`)}
+                          >
+                            Apply
+                          </button>
+                        </div>
+                      </div>
+
+                      {/* Alternative Strategies */}
+                      <details className="mt-3">
+                        <summary className="text-xs font-semibold text-slate-600 cursor-pointer hover:text-slate-800">
+                          View Alternative Strategies
+                        </summary>
+                        <div className="mt-2 grid grid-cols-1 md:grid-cols-2 gap-2">
+                          {Object.entries(piiDetectionResult.available_strategies)
+                            .filter(([strategy]) => strategy !== feature.recommended_strategy)
+                            .map(([strategy, details]: [string, any]) => (
+                              <div key={strategy} className="p-3 bg-white rounded border border-slate-200 hover:border-slate-400">
+                                <div className="font-semibold text-sm text-slate-800">{strategy}</div>
+                                <div className="text-xs text-slate-600 mt-1">{details.description}</div>
+                                <div className="mt-2 flex items-center justify-between">
+                                  <span className={`px-2 py-0.5 text-xs rounded ${
+                                    details.risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
+                                    details.risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
+                                    'bg-yellow-100 text-yellow-800'
+                                  }`}>
+                                    {details.risk_level} Risk
+                                  </span>
+                                  <button
+                                    className="px-2 py-1 bg-blue-600 text-white text-xs rounded hover:bg-blue-500"
+                                    onClick={() => alert(`Apply ${strategy} to ${feature.column}`)}
+                                  >
+                                    Use This
+                                  </button>
+                                </div>
+                              </div>
+                            ))}
+                        </div>
+                      </details>
+                    </div>
+                  );
+                })}
+              </div>
+
+              {/* Apply All Button */}
+              <div className="sticky bottom-0 p-4 bg-gradient-to-t from-white via-white to-transparent">
+                <button
+                  className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg"
+                  onClick={() => alert('Apply all recommended strategies and clean dataset')}
+                >
+                  ✓ Apply All Recommended Strategies & Clean Dataset
+                </button>
+              </div>
+            </div>
+          ) : (
+            <div className="text-center py-12">
+              <div className="text-6xl mb-4">🔍</div>
+              <p className="text-slate-600 mb-2">No PII detection results yet</p>
+              <p className="text-sm text-slate-500">Upload a dataset and click "🔍 Detect PII" to scan for risky features</p>
+            </div>
+          )}
+        </div>
       );
Frontend API client (lib/api):

@@ -74,6 +74,46 @@ export interface CleanResponse {
   timestamp: string;
 }
 
+export interface DetectPIIResponse {
+  status: string;
+  filename: string;
+  file_type: 'csv' | 'json' | 'text';
+  dataset_info: {
+    rows: number;
+    columns: number;
+    column_names: string[];
+  };
+  summary: {
+    total_columns_scanned: number;
+    risky_columns_found: number;
+    high_risk_count: number;
+    medium_risk_count: number;
+    low_risk_count: number;
+    unique_entity_types: number;
+  };
+  risky_features: Array<{
+    column: string;
+    entity_type: string;
+    risk_level: 'HIGH' | 'MEDIUM' | 'LOW' | 'UNKNOWN';
+    confidence: number;
+    detection_count: number;
+    recommended_strategy: string;
+    strategy_description: string;
+    reversible: boolean;
+    use_cases: string[];
+    gdpr_article: string;
+    sample_values: string[];
+    explanation: string;
+  }>;
+  available_strategies: Record<string, {
+    description: string;
+    risk_level: string;
+    reversible: boolean;
+    use_cases: string[];
+  }>;
+  message: string;
+}
+
 /**
  * Analyze dataset for bias and risk
  */
@@ -114,6 +154,26 @@ export async function cleanDataset(file: File): Promise<CleanResponse> {
   return response.json();
 }
 
+/**
+ * Detect PII (without anonymizing) for user review
+ */
+export async function detectPII(file: File): Promise<DetectPIIResponse> {
+  const formData = new FormData();
+  formData.append('file', file);
+
+  const response = await fetch(`${API_BASE_URL}/api/detect-pii`, {
+    method: 'POST',
+    body: formData,
+  });
+
+  if (!response.ok) {
+    const error = await response.json();
+    throw new Error(error.detail || 'PII detection failed');
+  }
+
+  return response.json();
+}
+
 /**
  * Download report file
  */
requirements.txt:

@@ -20,7 +20,7 @@ python-multipart>=0.0.6
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
 
 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0