Merge pull request #6 from dionjoshualobo/main

Blah Blah
ShovinDsouza · 2025-11-07 21:06:07 +05:30 · committed by GitHub
9 changed files with 629 additions and 36 deletions

.gitignore vendored · 2 changes

@@ -74,4 +74,4 @@ frontend/nordic-privacy-ai/.next/
frontend/nordic-privacy-ai/out/
frontend/nordic-privacy-ai/node_modules/
-Data
+Datamain.py

GDPRArticles.pdf · new binary file (not shown)

main.py

@@ -8,7 +8,7 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import os
-from api.routers import analyze, clean, discovery
+from api.routers import analyze, clean, discovery, detect_pii
# Create FastAPI app
app = FastAPI(
@@ -37,6 +37,7 @@ app.mount("/reports", StaticFiles(directory=reports_dir), name="reports")
# Include routers
app.include_router(analyze.router, prefix="/api", tags=["AI Governance"])
app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"])
app.include_router(detect_pii.router, prefix="/api", tags=["PII Detection"])
app.include_router(discovery.router, prefix="/api", tags=["Discover sources"])
@app.get("/")

api/routers/detect_pii.py · new file (224 additions)

@@ -0,0 +1,224 @@
"""
PII Detection Router
Detects risky features WITHOUT anonymizing them
Returns risk classification for user review
"""
from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import sys
from typing import Dict, Any, List
# Import cleaning module
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from data_cleaning.cleaner import DataCleaner
from data_cleaning.config import (
ENTITY_STRATEGY_MAP,
STRATEGIES,
GDPR_COMPLIANCE,
COLUMN_CONTEXT_FILTERS,
EXCLUSION_PATTERNS,
get_strategy_for_entity,
get_risk_level
)
router = APIRouter()
def convert_to_serializable(obj):
"""Convert numpy/pandas types to native Python types for JSON serialization"""
if isinstance(obj, (np.integer, np.int64, np.int32)):
return int(obj)
elif isinstance(obj, (np.floating, np.float64, np.float32)):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, dict):
return {key: convert_to_serializable(value) for key, value in obj.items()}
elif isinstance(obj, list):
return [convert_to_serializable(item) for item in obj]
return obj
@router.post("/detect-pii")
async def detect_pii(file: UploadFile = File(...)):
"""
Detect PII in uploaded file WITHOUT anonymizing
- **file**: CSV, JSON, or TXT file to analyze for PII
Returns:
- List of risky features with severity and recommended strategies
- Detection confidence scores
- GDPR article references
- Example values for review
"""
try:
# Read uploaded file
contents = await file.read()
file_extension = os.path.splitext(file.filename)[1].lower()
# Determine file type and parse accordingly
if file_extension == '.csv':
df = pd.read_csv(io.BytesIO(contents))
file_type = 'csv'
elif file_extension == '.json':
df = pd.read_json(io.BytesIO(contents))
file_type = 'json'
elif file_extension in ['.txt', '.text']:
# For plain text, create a single-column dataframe
text_content = contents.decode('utf-8', errors='ignore')
# Split into lines for better granularity
lines = [line.strip() for line in text_content.split('\n') if line.strip()]
df = pd.DataFrame({'text_content': lines})
file_type = 'text'
else:
# Try to auto-detect format
try:
# Try CSV first
df = pd.read_csv(io.BytesIO(contents))
file_type = 'csv'
except Exception:
try:
# Try JSON
df = pd.read_json(io.BytesIO(contents))
file_type = 'json'
except Exception:
# Fall back to plain text
text_content = contents.decode('utf-8', errors='ignore')
lines = [line.strip() for line in text_content.split('\n') if line.strip()]
df = pd.DataFrame({'text_content': lines})
file_type = 'text'
if df.empty:
raise HTTPException(status_code=400, detail="Uploaded file is empty")
print(f"Detecting PII in: {file.filename} ({file_type} format, {len(df)} rows, {len(df.columns)} columns)")
# Initialize Data Cleaner (with GPU if available)
cleaner = DataCleaner(df, use_gpu=True)
# Detect PII without cleaning
pii_detections = cleaner._detect_pii(
df=df,
risky_columns=None, # Scan all columns
scan_all_cells=True
)
# Classify by risk level
risk_classification = cleaner._classify_risk(pii_detections)
# Build response with detailed feature information
risky_features = []
for risk_level in ['HIGH', 'MEDIUM', 'LOW', 'UNKNOWN']:
detections = risk_classification[risk_level]
for column, entities in detections.items():
for entity_info in entities:
entity_type = entity_info['entity_type']
strategy = entity_info['strategy']
# Get sample values from the column (first 5 non-null; top 3 returned)
sample_values = df[column].dropna().head(5).astype(str).tolist()
# Get GDPR article
gdpr_article = GDPR_COMPLIANCE.get(entity_type, 'Not classified')
# Get strategy details
strategy_details = STRATEGIES.get(strategy, {})
risky_features.append({
'column': column,
'entity_type': entity_type,
'risk_level': risk_level,
'confidence': float(entity_info['confidence']),
'detection_count': int(entity_info['count']),
'recommended_strategy': strategy,
'strategy_description': strategy_details.get('description', ''),
'reversible': strategy_details.get('reversible', False),
'use_cases': strategy_details.get('use_cases', []),
'gdpr_article': gdpr_article,
'sample_values': sample_values[:3], # Show 3 examples
'explanation': _generate_risk_explanation(entity_type, risk_level, strategy)
})
# Sort by risk level (HIGH -> MEDIUM -> LOW)
risk_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2, 'UNKNOWN': 3}
risky_features.sort(key=lambda x: (risk_order[x['risk_level']], x['column']))
# Prepare summary statistics
summary = {
'total_columns_scanned': len(df.columns),
'risky_columns_found': len(set(f['column'] for f in risky_features)),
'high_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'HIGH'),
'medium_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'MEDIUM'),
'low_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'LOW'),
'unique_entity_types': len(set(f['entity_type'] for f in risky_features))
}
response_data = {
'status': 'success',
'filename': file.filename,
'file_type': file_type,
'dataset_info': {
'rows': len(df),
'columns': len(df.columns),
'column_names': df.columns.tolist()
},
'summary': summary,
'risky_features': risky_features,
'available_strategies': STRATEGIES,
'message': f"Found {summary['risky_columns_found']} columns with PII ({summary['high_risk_count']} HIGH risk, {summary['medium_risk_count']} MEDIUM risk, {summary['low_risk_count']} LOW risk)"
}
# Convert all numpy/pandas types to native Python types
response_data = convert_to_serializable(response_data)
return JSONResponse(content=response_data)
except pd.errors.EmptyDataError:
raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
except ImportError:
raise HTTPException(status_code=500, detail="Presidio not installed. Please install: pip install presidio-analyzer presidio-anonymizer")
except Exception as e:
print(f"Error during PII detection: {str(e)}")
import traceback
traceback.print_exc()
raise HTTPException(status_code=500, detail=f"PII detection failed: {str(e)}")
def _generate_risk_explanation(entity_type: str, risk_level: str, strategy: str) -> str:
"""Generate human-readable explanation for why a feature is risky"""
explanations = {
'CREDIT_CARD': "Credit card numbers are highly sensitive financial identifiers protected under GDPR Art. 4(1) and PCI-DSS regulations. Unauthorized disclosure can lead to fraud and identity theft.",
'US_SSN': "Social Security Numbers are government-issued identifiers that can be used for identity theft. They are strictly protected under US federal law and GDPR Art. 4(1).",
'EMAIL_ADDRESS': "Email addresses are personal identifiers under GDPR Art. 4(1) that can be used to re-identify individuals and track behavior across services.",
'PHONE_NUMBER': "Phone numbers are direct personal identifiers under GDPR Art. 4(1) that enable contact and can be used to track individuals.",
'PERSON': "Personal names are explicit identifiers under GDPR Art. 4(1) that directly identify individuals and must be protected in datasets.",
'LOCATION': "Location data reveals personal information about individuals' movements and residence, protected under GDPR Art. 4(1) as personal data.",
'IP_ADDRESS': "IP addresses are online identifiers under GDPR Art. 4(1) that can be used to track individuals across the internet.",
'DATE_TIME': "Temporal data can be used to re-identify individuals when combined with other data points, especially for rare events.",
'MEDICAL_LICENSE': "Medical information is special category data under GDPR Art. 9(1) requiring heightened protection due to health privacy concerns.",
'NRP': "Nationality, religious, or political views are special category data under GDPR Art. 9(1) that can lead to discrimination.",
'US_BANK_NUMBER': "Bank account numbers are financial identifiers that enable unauthorized access to accounts and are protected under GDPR Art. 4(1).",
'CRYPTO': "Cryptocurrency addresses are financial identifiers that can reveal transaction history and wealth, requiring protection.",
'FI_PERSONAL_ID': "Finnish personal identity numbers (HETU) are highly sensitive national identifiers under GDPR Art. 4(1) + Recital 26, granting access to government services.",
'SE_PERSONAL_ID': "Swedish Personnummer are national identifiers protected under GDPR Art. 4(1) + Recital 26, used across all government and private services.",
'NO_PERSONAL_ID': "Norwegian Fødselsnummer are national ID numbers under GDPR Art. 4(1) + Recital 26, used for all official identification.",
'DK_PERSONAL_ID': "Danish CPR numbers are national identifiers protected under GDPR Art. 4(1) + Recital 26, critical for government services.",
'FI_BUSINESS_ID': "Finnish business IDs (Y-tunnus) are organizational identifiers with lower risk than personal IDs, but still require protection for business privacy.",
}
base_explanation = explanations.get(entity_type,
f"{entity_type} detected as {risk_level} risk personal data under GDPR regulations requiring appropriate protection measures.")
strategy_note = f" Recommended action: {strategy} - this {'permanently removes' if strategy == 'REMOVE' else 'anonymizes'} the data to ensure compliance."
return base_explanation + strategy_note
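A quick usage sketch for this endpoint: the snippet below posts a file to /api/detect-pii with the requests library and prints the returned risk summary. The base URL, port, and file name are assumptions for illustration; the path and response fields follow the router above.

import requests

# Hypothetical deployment URL; adjust host/port to your setup.
API_URL = "http://localhost:8000/api/detect-pii"

# "customers.csv" is a placeholder file name.
with open("customers.csv", "rb") as f:
    resp = requests.post(API_URL, files={"file": ("customers.csv", f, "text/csv")})
resp.raise_for_status()

report = resp.json()
print(report["message"])
for feature in report["risky_features"]:
    print(f"{feature['column']}: {feature['entity_type']} "
          f"({feature['risk_level']}, confidence {feature['confidence']:.2f})")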

data_cleaning/cleaner.py

@@ -8,6 +8,7 @@ import pandas as pd
import numpy as np
import hashlib
import json
import re
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any
from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
) -> Dict[str, List[Dict]]:
"""
Detect PII at column and cell level (GPU-accelerated when available)
With intelligent filtering for false positives
Returns:
Dictionary mapping column names to list of detected entities
"""
import re
from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
pii_detections = defaultdict(list)
# Determine which columns to scan
@@ -417,15 +422,52 @@ class DataCleaner:
)
if results:
-# Aggregate by entity type
-entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+# Aggregate by entity type with filtering
+entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+filtered_reasons = []
for result in results:
-entity_summary[result.entity_type]['count'] += 1
-entity_summary[result.entity_type]['scores'].append(result.score)
entity_type = result.entity_type
# Extract detected text from original string using start/end positions
detected_text = combined_text[result.start:result.end]
-# Store detection results
# ✅ FILTER 1: Column Context Filtering
# Skip if entity type should be ignored based on column name
context_filtered = False
for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
if re.search(pattern, column.lower()) and entity_type in ignored_entities:
context_filtered = True
entity_summary[entity_type]['filtered'] += 1
if f"column context ({pattern})" not in filtered_reasons:
filtered_reasons.append(f"column context ({pattern})")
break
if context_filtered:
continue
# ✅ FILTER 2: Value Pattern Exclusions
# Skip if detected value matches exclusion patterns
pattern_filtered = False
if entity_type in EXCLUSION_PATTERNS:
for exclusion_pattern in EXCLUSION_PATTERNS[entity_type]:
if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
pattern_filtered = True
entity_summary[entity_type]['filtered'] += 1
if f"value pattern ({exclusion_pattern[:20]}...)" not in filtered_reasons:
filtered_reasons.append(f"value pattern")
break
if pattern_filtered:
continue
# ✅ Not filtered - count as valid detection
entity_summary[entity_type]['count'] += 1
entity_summary[entity_type]['scores'].append(result.score)
# Store detection results (only non-filtered)
detected_types = []
for entity_type, info in entity_summary.items():
if info['count'] > 0: # Only include if we have valid (non-filtered) detections
avg_confidence = np.mean(info['scores'])
pii_detections[column].append({
'entity_type': entity_type,
@@ -434,9 +476,15 @@ class DataCleaner:
'max_confidence': max(info['scores']),
'min_confidence': min(info['scores'])
})
+detected_types.append(entity_type)
-detected_types = [d['entity_type'] for d in pii_detections[column]]
if detected_types:
print(f"✓ Found: {', '.join(detected_types)}")
elif any(info['filtered'] > 0 for info in entity_summary.values()):
total_filtered = sum(info['filtered'] for info in entity_summary.values())
print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
else:
print("(no PII)")
else:
print("(no PII)")

data_cleaning/config.py

@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
# Presidio Analyzer Settings
PRESIDIO_CONFIG = {
'language': 'en',
-'score_threshold': 0.5, # Minimum confidence to report
+'score_threshold': 0.6, # Minimum confidence to report (raised from 0.5 to reduce false positives)
'entities': None, # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-'allow_list': [], # Terms to ignore (e.g., company names that look like PII)
+'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'], # Common non-PII values
}
# Column Context Filters - Ignore specific entity types based on column name patterns
# This prevents false positives when column names provide context
COLUMN_CONTEXT_FILTERS = {
# Column name pattern (regex) -> List of entity types to IGNORE in that column
r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
r'.*code.*': ['PERSON'], # Codes are rarely names
r'.*id$': ['PERSON'], # IDs ending in 'id' are rarely names
r'.*_id$': ['PERSON'], # Same for underscore_id
}
# Value Pattern Exclusions - Ignore values matching these patterns for specific entity types
# This catches false positives based on the actual detected value format
EXCLUSION_PATTERNS = {
'US_DRIVER_LICENSE': [
r'^[a-zA-Z]\d{1,2}$', # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
r'^[a-zA-Z]{1,2}$', # 1-2 letters only (e.g., A, AB)
r'^level\s*\d+$', # "level 1", "level 2", etc.
r'^tier\s*\d+$', # "tier 1", "tier 2", etc.
r'^grade\s*[a-zA-Z]$', # "grade A", "grade B", etc.
],
'US_PASSPORT': [
r'^[a-zA-Z]\d{1,2}$', # Single letter + 1-2 digits
r'^[a-zA-Z]{1,2}$', # 1-2 letters only
],
'PERSON': [
r'^(admin|user|guest|system|default|test|demo)$', # Generic usernames
r'^[a-zA-Z]\d*$', # Single letter with optional numbers (A, A1, B2)
r'^(yes|no|true|false|y|n|t|f)$', # Boolean values
r'^(male|female|m|f|other)$', # Gender categories
r'^(low|medium|high|good|bad|excellent|poor)$', # Rating values
],
'EMAIL_ADDRESS': [
r'^(test|demo|example|sample)@', # Test emails
r'@(test|demo|example|sample)\.', # Test domains
],
'PHONE_NUMBER': [
r'^(000|111|222|333|444|555|666|777|888|999)[-\s]', # Fake phone patterns
r'^1{6,}$', # All 1s
r'^0{6,}$', # All 0s
],
}
# Custom Recognizers (domain-specific patterns)
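For reference, a minimal sketch of how these settings map onto Presidio's AnalyzerEngine. This assumes presidio-analyzer is installed and a recent version that accepts allow_list in analyze(); the sample text is invented.

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="Contact jane.doe@example.com about membership level l2",
    language='en',
    score_threshold=0.6,  # mirrors the raised PRESIDIO_CONFIG threshold
    allow_list=['l1', 'l2', 'NA', 'N/A'],  # common non-PII tokens, as configured above
)
for r in results:
    print(r.entity_type, r.start, r.end, round(r.score, 2))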

CenterPanel.tsx

@@ -2,7 +2,7 @@
import { TryTab } from "./Sidebar";
import { useState, useRef, useCallback, useEffect } from "react";
import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/indexeddb";
-import { analyzeDataset, cleanDataset, getReportUrl, type AnalyzeResponse, type CleanResponse } from "../../lib/api";
+import { analyzeDataset, cleanDataset, detectPII, getReportUrl, type AnalyzeResponse, type CleanResponse, type DetectPIIResponse } from "../../lib/api";
interface CenterPanelProps {
tab: TryTab;
@@ -38,6 +38,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
// Analysis results
const [analyzeResult, setAnalyzeResult] = useState<AnalyzeResponse | null>(null);
const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);
const [piiDetectionResult, setPIIDetectionResult] = useState<DetectPIIResponse | null>(null);
const reset = () => {
setFileMeta(null);
@@ -46,6 +47,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
setProgressLabel("Processing");
setTablePreview(null);
setError(null);
setPIIDetectionResult(null);
};
// Handle API calls
@@ -71,6 +73,27 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
}
};
const handleDetectPII = async () => {
if (!uploadedFile) {
setError("No file uploaded");
return;
}
setIsProcessing(true);
setError(null);
setProgressLabel("Detecting PII...");
try {
const result = await detectPII(uploadedFile);
setPIIDetectionResult(result);
setProgressLabel("PII detection complete!");
} catch (err: any) {
setError(err.message || "PII detection failed");
} finally {
setIsProcessing(false);
}
};
const handleClean = async () => {
if (!uploadedFile) {
setError("No file uploaded");
@@ -380,6 +403,18 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
</div>
)}
{piiDetectionResult && (
<div className="mt-3 p-3 bg-blue-50 border border-blue-200 rounded-md text-sm text-blue-700">
🔍 PII Detection complete! Found {piiDetectionResult.summary.risky_columns_found} risky columns in {piiDetectionResult.file_type.toUpperCase()} file.
<div className="mt-1 text-xs">
<span className="font-semibold text-red-700">{piiDetectionResult.summary.high_risk_count} HIGH</span>
<span className="font-semibold text-orange-600 ml-1">{piiDetectionResult.summary.medium_risk_count} MEDIUM</span>
<span className="font-semibold text-yellow-600 ml-1">{piiDetectionResult.summary.low_risk_count} LOW</span>
</div>
<p className="mt-2 text-xs">Review detected risks in the "Bias & Risk Mitigation" tab to choose anonymization strategies.</p>
</div>
)}
{analyzeResult && (
<div className="mt-3 p-3 bg-green-50 border border-green-200 rounded-md text-sm text-green-700">
Analysis complete! View results in tabs.
@@ -426,6 +461,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
setLoadedFromCache(false);
setAnalyzeResult(null);
setCleanResult(null);
setPIIDetectionResult(null);
}}
className="text-xs rounded-md border px-3 py-1.5 hover:bg-slate-50"
>
@@ -433,11 +469,11 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
</button>
<button
type="button"
-onClick={handleClean}
+onClick={handleDetectPII}
disabled={isProcessing}
-className="text-xs rounded-md bg-green-600 text-white px-3 py-1.5 hover:bg-green-500 disabled:opacity-50 disabled:cursor-not-allowed"
+className="text-xs rounded-md bg-blue-600 text-white px-3 py-1.5 hover:bg-blue-500 disabled:opacity-50 disabled:cursor-not-allowed"
>
-{isProcessing ? "Processing..." : "Clean (PII)"}
+{isProcessing ? "Processing..." : "🔍 Detect PII"}
</button>
<button
type="button"
@@ -445,7 +481,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
disabled={isProcessing}
className="text-xs rounded-md bg-brand-600 text-white px-3 py-1.5 hover:bg-brand-500 disabled:opacity-50 disabled:cursor-not-allowed"
>
{isProcessing ? "Processing..." : "Analyze"}
{isProcessing ? "Processing..." : "Analyze"}
</button>
</div>
</div>
@@ -1100,20 +1136,190 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
);
case "bias-risk-mitigation":
return (
<div className="space-y-4">
<h2 className="text-xl font-semibold">Mitigation Suggestions</h2>
{analyzeResult && analyzeResult.recommendations.length > 0 ? (
<div className="space-y-2">
{analyzeResult.recommendations.map((rec, i) => (
<div key={i} className="p-3 bg-blue-50 border border-blue-200 rounded-md text-sm">
{rec}
<div className="space-y-6">
<div>
<h2 className="text-2xl font-bold mb-2">PII Detection & Anonymization Strategy</h2>
<p className="text-sm text-slate-600">Review detected risky features and choose how to anonymize them</p>
</div>
{piiDetectionResult ? (
<div className="space-y-6">
{/* File Info Banner */}
<div className="p-3 bg-slate-100 border border-slate-300 rounded-lg text-sm">
<div className="flex items-center gap-3">
<span className="font-semibold text-slate-700">File:</span>
<code className="px-2 py-1 bg-white rounded border border-slate-200">{piiDetectionResult.filename}</code>
<span className="px-2 py-0.5 bg-blue-100 text-blue-800 text-xs font-semibold rounded">
{piiDetectionResult.file_type.toUpperCase()}
</span>
<span className="text-slate-600">
{piiDetectionResult.dataset_info.rows} rows × {piiDetectionResult.dataset_info.columns} columns
</span>
</div>
</div>
{/* Summary Card */}
<div className="p-6 bg-gradient-to-br from-blue-50 to-indigo-50 rounded-xl border-2 border-blue-200">
<div className="grid grid-cols-1 md:grid-cols-4 gap-4">
<div>
<div className="text-xs font-semibold text-blue-700 mb-1">TOTAL COLUMNS SCANNED</div>
<div className="text-3xl font-bold text-blue-900">{piiDetectionResult.summary.total_columns_scanned}</div>
</div>
<div>
<div className="text-xs font-semibold text-red-700 mb-1">HIGH RISK</div>
<div className="text-3xl font-bold text-red-900">{piiDetectionResult.summary.high_risk_count}</div>
<div className="text-xs text-slate-600">Must remove</div>
</div>
<div>
<div className="text-xs font-semibold text-orange-700 mb-1">MEDIUM RISK</div>
<div className="text-3xl font-bold text-orange-900">{piiDetectionResult.summary.medium_risk_count}</div>
<div className="text-xs text-slate-600">Hash recommended</div>
</div>
<div>
<div className="text-xs font-semibold text-yellow-700 mb-1">LOW RISK</div>
<div className="text-3xl font-bold text-yellow-900">{piiDetectionResult.summary.low_risk_count}</div>
<div className="text-xs text-slate-600">Mask/generalize</div>
</div>
</div>
<div className="mt-4 p-3 bg-white/70 rounded-lg text-sm text-slate-700">
{piiDetectionResult.message}
</div>
</div>
{/* Risky Features List */}
<div className="space-y-3">
{piiDetectionResult.risky_features.map((feature, idx) => {
const riskColor =
feature.risk_level === 'HIGH' ? 'red' :
feature.risk_level === 'MEDIUM' ? 'orange' :
feature.risk_level === 'LOW' ? 'yellow' : 'gray';
const bgColor =
feature.risk_level === 'HIGH' ? 'bg-red-50 border-red-300' :
feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' :
feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' : 'bg-gray-50 border-gray-300';
return (
<div key={idx} className={`p-5 rounded-xl border-2 ${bgColor}`}>
{/* Header */}
<div className="flex items-start justify-between mb-3">
<div className="flex-1">
<div className="flex items-center gap-3 mb-2">
<span className={`px-3 py-1 bg-${riskColor}-600 text-white text-xs font-bold rounded-full`}>
{feature.risk_level} RISK
</span>
<span className="font-mono font-bold text-lg text-slate-800">{feature.column}</span>
</div>
<div className="text-sm text-slate-700">
<span className="font-semibold">Detected:</span> {feature.entity_type}
<span className="mx-2"></span>
<span className="font-semibold">Confidence:</span> {(feature.confidence * 100).toFixed(1)}%
<span className="mx-2"></span>
<span className="font-semibold">Occurrences:</span> {feature.detection_count}
</div>
</div>
</div>
{/* Explanation */}
<div className="p-4 bg-white rounded-lg mb-4">
<div className="text-xs font-semibold text-slate-600 mb-2">WHY IS THIS RISKY?</div>
<p className="text-sm text-slate-700 leading-relaxed">{feature.explanation}</p>
<div className="mt-3 text-xs text-slate-600">
<strong>GDPR Reference:</strong> {feature.gdpr_article}
</div>
</div>
{/* Sample Values */}
{feature.sample_values.length > 0 && (
<div className="p-4 bg-white rounded-lg mb-4">
<div className="text-xs font-semibold text-slate-600 mb-2">SAMPLE VALUES</div>
<div className="flex gap-2 flex-wrap">
{feature.sample_values.map((val, i) => (
<code key={i} className="px-2 py-1 bg-slate-100 rounded text-xs text-slate-800 border border-slate-200">
{val}
</code>
))}
</div>
</div>
)}
{/* Recommended Strategy */}
<div className="p-4 bg-white rounded-lg border-2 border-green-300">
<div className="flex items-start gap-3">
<div className="flex-1">
<div className="text-xs font-semibold text-green-700 mb-1"> RECOMMENDED STRATEGY</div>
<div className="font-bold text-lg text-slate-900">{feature.recommended_strategy}</div>
<div className="text-sm text-slate-700 mt-1">{feature.strategy_description}</div>
<div className="mt-2 flex gap-4 text-xs text-slate-600">
<div>
<strong>Reversible:</strong> {feature.reversible ? 'Yes' : 'No'}
</div>
<div>
<strong>Use Cases:</strong> {feature.use_cases.join(', ')}
</div>
</div>
</div>
<button
className="px-4 py-2 bg-green-600 text-white text-sm font-semibold rounded-lg hover:bg-green-500"
onClick={() => alert(`Apply ${feature.recommended_strategy} to ${feature.column}`)}
>
Apply
</button>
</div>
</div>
{/* Alternative Strategies */}
<details className="mt-3">
<summary className="text-xs font-semibold text-slate-600 cursor-pointer hover:text-slate-800">
View Alternative Strategies
</summary>
<div className="mt-2 grid grid-cols-1 md:grid-cols-2 gap-2">
{Object.entries(piiDetectionResult.available_strategies)
.filter(([strategy]) => strategy !== feature.recommended_strategy)
.map(([strategy, details]: [string, any]) => (
<div key={strategy} className="p-3 bg-white rounded border border-slate-200 hover:border-slate-400">
<div className="font-semibold text-sm text-slate-800">{strategy}</div>
<div className="text-xs text-slate-600 mt-1">{details.description}</div>
<div className="mt-2 flex items-center justify-between">
<span className={`px-2 py-0.5 text-xs rounded ${
details.risk_level === 'HIGH' ? 'bg-red-100 text-red-800' :
details.risk_level === 'MEDIUM' ? 'bg-orange-100 text-orange-800' :
'bg-yellow-100 text-yellow-800'
}`}>
{details.risk_level} Risk
</span>
<button
className="px-2 py-1 bg-blue-600 text-white text-xs rounded hover:bg-blue-500"
onClick={() => alert(`Apply ${strategy} to ${feature.column}`)}
>
Use This
</button>
</div>
</div>
))}
</div>
</details>
</div>
);
})}
</div>
{/* Apply All Button */}
<div className="sticky bottom-0 p-4 bg-gradient-to-t from-white via-white to-transparent">
<button
className="w-full py-3 bg-green-600 text-white font-bold rounded-lg hover:bg-green-500 shadow-lg"
onClick={() => alert('Apply all recommended strategies and clean dataset')}
>
Apply All Recommended Strategies & Clean Dataset
</button>
</div>
</div>
) : (
<p className="text-sm text-slate-600">
Recommendations will appear here after analysis.
</p>
<div className="text-center py-12">
<div className="text-6xl mb-4">🔍</div>
<p className="text-slate-600 mb-2">No PII detection results yet</p>
<p className="text-sm text-slate-500">Upload a dataset and click "🔍 Detect PII" to scan for risky features</p>
</div>
)}
</div>
);

lib/api.ts

@@ -74,6 +74,46 @@ export interface CleanResponse {
timestamp: string;
}
export interface DetectPIIResponse {
status: string;
filename: string;
file_type: 'csv' | 'json' | 'text';
dataset_info: {
rows: number;
columns: number;
column_names: string[];
};
summary: {
total_columns_scanned: number;
risky_columns_found: number;
high_risk_count: number;
medium_risk_count: number;
low_risk_count: number;
unique_entity_types: number;
};
risky_features: Array<{
column: string;
entity_type: string;
risk_level: 'HIGH' | 'MEDIUM' | 'LOW' | 'UNKNOWN';
confidence: number;
detection_count: number;
recommended_strategy: string;
strategy_description: string;
reversible: boolean;
use_cases: string[];
gdpr_article: string;
sample_values: string[];
explanation: string;
}>;
available_strategies: Record<string, {
description: string;
risk_level: string;
reversible: boolean;
use_cases: string[];
}>;
message: string;
}
/**
* Analyze dataset for bias and risk
*/
@@ -114,6 +154,26 @@ export async function cleanDataset(file: File): Promise<CleanResponse> {
return response.json();
}
/**
* Detect PII (without anonymizing) for user review
*/
export async function detectPII(file: File): Promise<DetectPIIResponse> {
const formData = new FormData();
formData.append('file', file);
const response = await fetch(`${API_BASE_URL}/api/detect-pii`, {
method: 'POST',
body: formData,
});
if (!response.ok) {
const error = await response.json();
throw new Error(error.detail || 'PII detection failed');
}
return response.json();
}
/**
* Download report file
*/

requirements.txt

@@ -20,7 +20,7 @@ python-multipart>=0.0.6
# torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
# Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
annotated-types==0.7.0
anyio==4.11.0
blis==1.3.0