diff --git a/.gitignore b/.gitignore
index 7f76539..44e41e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,4 +74,5 @@
 frontend/nordic-privacy-ai/.next/
 frontend/nordic-privacy-ai/out/
 frontend/nordic-privacy-ai/node_modules/
-Data
\ No newline at end of file
+Data
+main.py
diff --git a/GDPRArticles.pdf b/GDPRArticles.pdf
new file mode 100644
index 0000000..554fd98
Binary files /dev/null and b/GDPRArticles.pdf differ
diff --git a/api/main.py b/api/main.py
index 5c1311b..3386bca 100644
--- a/api/main.py
+++ b/api/main.py
@@ -8,7 +8,7 @@
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 import os
-from api.routers import analyze, clean, discovery
+from api.routers import analyze, clean, discovery, detect_pii
 
 # Create FastAPI app
 app = FastAPI(
@@ -37,6 +37,7 @@
 app.mount("/reports", StaticFiles(directory=reports_dir), name="reports")
 
 # Include routers
 app.include_router(analyze.router, prefix="/api", tags=["AI Governance"])
 app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"])
+app.include_router(detect_pii.router, prefix="/api", tags=["PII Detection"])
 app.include_router(discovery.router, prefix="/api", tags=["Discover sources"])
 
 @app.get("/")
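With the router registered above, the new route should be reachable under the /api prefix. A minimal smoke test of the wiring, as a sketch (assumes FastAPI's TestClient and that the app's dependencies such as pandas and Presidio are installed; the CSV content here is invented):

    from fastapi.testclient import TestClient

    from api.main import app

    client = TestClient(app)

    # Hypothetical two-column CSV with an obvious identifier column
    csv_bytes = b"name,email\nAda Lovelace,ada@example.com\n"
    resp = client.post(
        "/api/detect-pii",
        files={"file": ("sample.csv", csv_bytes, "text/csv")},
    )
    assert resp.status_code == 200
    assert "risky_features" in resp.json()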
diff --git a/api/routers/detect_pii.py b/api/routers/detect_pii.py
new file mode 100644
index 0000000..865907b
--- /dev/null
+++ b/api/routers/detect_pii.py
@@ -0,0 +1,227 @@
+"""
+PII Detection Router
+Detects risky features WITHOUT anonymizing them
+Returns risk classification for user review
+"""
+
+from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+import pandas as pd
+import numpy as np
+import io
+import os
+import sys
+from typing import Dict, Any, List
+
+# Import cleaning module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+from data_cleaning.cleaner import DataCleaner
+from data_cleaning.config import (
+    ENTITY_STRATEGY_MAP,
+    STRATEGIES,
+    GDPR_COMPLIANCE,
+    COLUMN_CONTEXT_FILTERS,
+    EXCLUSION_PATTERNS,
+    get_strategy_for_entity,
+    get_risk_level
+)
+
+router = APIRouter()
+
+
+def convert_to_serializable(obj):
+    """Convert numpy/pandas types to native Python types for JSON serialization"""
+    if isinstance(obj, (np.integer, np.int64, np.int32)):
+        return int(obj)
+    elif isinstance(obj, (np.floating, np.float64, np.float32)):
+        return float(obj)
+    elif isinstance(obj, np.ndarray):
+        return obj.tolist()
+    elif isinstance(obj, dict):
+        return {key: convert_to_serializable(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_to_serializable(item) for item in obj]
+    return obj
+
+
+@router.post("/detect-pii")
+async def detect_pii(file: UploadFile = File(...)):
+    """
+    Detect PII in an uploaded file WITHOUT anonymizing it
+
+    - **file**: CSV, JSON, or TXT file to analyze for PII
+
+    Returns:
+    - List of risky features with severity and recommended strategies
+    - Detection confidence scores
+    - GDPR article references
+    - Example values for review
+    """
+
+    try:
+        # Read uploaded file
+        contents = await file.read()
+        file_extension = os.path.splitext(file.filename)[1].lower()
+
+        # Determine file type and parse accordingly
+        if file_extension == '.csv':
+            df = pd.read_csv(io.BytesIO(contents))
+            file_type = 'csv'
+        elif file_extension == '.json':
+            df = pd.read_json(io.BytesIO(contents))
+            file_type = 'json'
+        elif file_extension in ['.txt', '.text']:
+            # For plain text, create a single-column dataframe
+            text_content = contents.decode('utf-8', errors='ignore')
+            # Split into lines for better granularity
+            lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+            df = pd.DataFrame({'text_content': lines})
+            file_type = 'text'
+        else:
+            # Try to auto-detect the format
+            try:
+                # Try CSV first
+                df = pd.read_csv(io.BytesIO(contents))
+                file_type = 'csv'
+            except Exception:
+                try:
+                    # Try JSON
+                    df = pd.read_json(io.BytesIO(contents))
+                    file_type = 'json'
+                except Exception:
+                    # Fall back to plain text
+                    text_content = contents.decode('utf-8', errors='ignore')
+                    lines = [line.strip() for line in text_content.split('\n') if line.strip()]
+                    df = pd.DataFrame({'text_content': lines})
+                    file_type = 'text'
+
+        if df.empty:
+            raise HTTPException(status_code=400, detail="Uploaded file is empty")
+
+        print(f"Detecting PII in: {file.filename} ({file_type} format, {len(df)} rows, {len(df.columns)} columns)")
+
+        # Initialize Data Cleaner (with GPU if available)
+        cleaner = DataCleaner(df, use_gpu=True)
+
+        # Detect PII without cleaning
+        pii_detections = cleaner._detect_pii(
+            df=df,
+            risky_columns=None,  # Scan all columns
+            scan_all_cells=True
+        )
+
+        # Classify by risk level
+        risk_classification = cleaner._classify_risk(pii_detections)
+
+        # Build response with detailed feature information
+        risky_features = []
+
+        for risk_level in ['HIGH', 'MEDIUM', 'LOW', 'UNKNOWN']:
+            detections = risk_classification[risk_level]
+
+            for column, entities in detections.items():
+                for entity_info in entities:
+                    entity_type = entity_info['entity_type']
+                    strategy = entity_info['strategy']
+
+                    # Get example values from the column (first 5 non-null; 3 returned below)
+                    sample_values = df[column].dropna().head(5).astype(str).tolist()
+
+                    # Get GDPR article
+                    gdpr_article = GDPR_COMPLIANCE.get(entity_type, 'Not classified')
+
+                    # Get strategy details
+                    strategy_details = STRATEGIES.get(strategy, {})
+
+                    risky_features.append({
+                        'column': column,
+                        'entity_type': entity_type,
+                        'risk_level': risk_level,
+                        'confidence': float(entity_info['confidence']),
+                        'detection_count': int(entity_info['count']),
+                        'recommended_strategy': strategy,
+                        'strategy_description': strategy_details.get('description', ''),
+                        'reversible': strategy_details.get('reversible', False),
+                        'use_cases': strategy_details.get('use_cases', []),
+                        'gdpr_article': gdpr_article,
+                        'sample_values': sample_values[:3],  # Show 3 examples
+                        'explanation': _generate_risk_explanation(entity_type, risk_level, strategy)
+                    })
+
+        # Sort by risk level (HIGH -> MEDIUM -> LOW -> UNKNOWN)
+        risk_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2, 'UNKNOWN': 3}
+        risky_features.sort(key=lambda x: (risk_order[x['risk_level']], x['column']))
+
+        # Prepare summary statistics
+        summary = {
+            'total_columns_scanned': len(df.columns),
+            'risky_columns_found': len(set(f['column'] for f in risky_features)),
+            'high_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'HIGH'),
+            'medium_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'MEDIUM'),
+            'low_risk_count': sum(1 for f in risky_features if f['risk_level'] == 'LOW'),
+            'unique_entity_types': len(set(f['entity_type'] for f in risky_features))
+        }
+
+        response_data = {
+            'status': 'success',
+            'filename': file.filename,
+            'file_type': file_type,
+            'dataset_info': {
+                'rows': len(df),
+                'columns': len(df.columns),
+                'column_names': df.columns.tolist()
+            },
+            'summary': summary,
+            'risky_features': risky_features,
+            'available_strategies': STRATEGIES,
+            'message': f"Found {summary['risky_columns_found']} columns with PII ({summary['high_risk_count']} HIGH risk, {summary['medium_risk_count']} MEDIUM risk, {summary['low_risk_count']} LOW risk)"
+        }
+
+        # Convert all numpy/pandas types to native Python types
+        response_data = convert_to_serializable(response_data)
+
+        return JSONResponse(content=response_data)
+
+    except HTTPException:
+        # Propagate explicit HTTP errors (e.g., empty upload) instead of masking them as 500s
+        raise
+    except pd.errors.EmptyDataError:
+        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
+    except ImportError:
+        raise HTTPException(status_code=500, detail="Presidio not installed. Please install: pip install presidio-analyzer presidio-anonymizer")
+    except Exception as e:
+        print(f"Error during PII detection: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=f"PII detection failed: {str(e)}")
+
+
+def _generate_risk_explanation(entity_type: str, risk_level: str, strategy: str) -> str:
+    """Generate a human-readable explanation for why a feature is risky"""
+
+    explanations = {
+        'CREDIT_CARD': "Credit card numbers are highly sensitive financial identifiers protected under GDPR Art. 4(1) and PCI-DSS regulations. Unauthorized disclosure can lead to fraud and identity theft.",
+        'US_SSN': "Social Security Numbers are government-issued identifiers that can be used for identity theft. They are strictly protected under US federal law and GDPR Art. 4(1).",
+        'EMAIL_ADDRESS': "Email addresses are personal identifiers under GDPR Art. 4(1) that can be used to re-identify individuals and track behavior across services.",
+        'PHONE_NUMBER': "Phone numbers are direct personal identifiers under GDPR Art. 4(1) that enable contact and can be used to track individuals.",
+        'PERSON': "Personal names are explicit identifiers under GDPR Art. 4(1) that directly identify individuals and must be protected in datasets.",
+        'LOCATION': "Location data reveals personal information about individuals' movements and residence, protected under GDPR Art. 4(1) as personal data.",
+        'IP_ADDRESS': "IP addresses are online identifiers under GDPR Art. 4(1) that can be used to track individuals across the internet.",
+        'DATE_TIME': "Temporal data can be used to re-identify individuals when combined with other data points, especially for rare events.",
+        'MEDICAL_LICENSE': "Medical information is special category data under GDPR Art. 9(1) requiring heightened protection due to health privacy concerns.",
+        'NRP': "Nationality, religious, or political views are special category data under GDPR Art. 9(1) that can lead to discrimination.",
+        'US_BANK_NUMBER': "Bank account numbers are financial identifiers that enable unauthorized access to accounts and are protected under GDPR Art. 4(1).",
+        'CRYPTO': "Cryptocurrency addresses are financial identifiers that can reveal transaction history and wealth, requiring protection.",
+        'FI_PERSONAL_ID': "Finnish personal identity numbers (HETU) are highly sensitive national identifiers under GDPR Art. 4(1) + Recital 26, granting access to government services.",
+        'SE_PERSONAL_ID': "Swedish Personnummer are national identifiers protected under GDPR Art. 4(1) + Recital 26, used across all government and private services.",
+        'NO_PERSONAL_ID': "Norwegian Fødselsnummer are national ID numbers under GDPR Art. 4(1) + Recital 26, used for all official identification.",
+        'DK_PERSONAL_ID': "Danish CPR numbers are national identifiers protected under GDPR Art. 4(1) + Recital 26, critical for government services.",
+        'FI_BUSINESS_ID': "Finnish business IDs (Y-tunnus) are organizational identifiers with lower risk than personal IDs, but still require protection for business privacy.",
+    }
+
+    base_explanation = explanations.get(
+        entity_type,
+        f"{entity_type} detected as {risk_level} risk personal data under GDPR regulations requiring appropriate protection measures."
+    )
+
+    strategy_note = f" Recommended action: {strategy} - this {'permanently removes' if strategy == 'REMOVE' else 'anonymizes'} the data to ensure compliance."
+
+    return base_explanation + strategy_note
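The convert_to_serializable pass above matters because JSONResponse uses the standard json encoder, which rejects numpy scalars and arrays. A minimal sketch of the failure mode and what the conversion produces (values invented):

    import json
    import numpy as np

    payload = {'detection_count': np.int64(3), 'scores': np.array([0.91, 0.78])}
    # json.dumps(payload)  # TypeError: Object of type int64 is not JSON serializable

    # After recursive conversion to plain Python types, serialization succeeds
    clean = {'detection_count': 3, 'scores': [0.91, 0.78]}
    print(json.dumps(clean))  # {"detection_count": 3, "scores": [0.91, 0.78]}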
diff --git a/data_cleaning/cleaner.py b/data_cleaning/cleaner.py
index 658b1b8..61e97f5 100644
--- a/data_cleaning/cleaner.py
+++ b/data_cleaning/cleaner.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,13 @@
     ) -> Dict[str, List[Dict]]:
         """
         Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
 
         Returns:
             Dictionary mapping column names to list of detected entities
         """
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+
         pii_detections = defaultdict(list)
 
         # Determine which columns to scan
@@ -417,26 +422,69 @@
             )
 
             if results:
-                # Aggregate by entity type
-                entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                # Aggregate by entity type with filtering
+                entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                filtered_reasons = []
 
                 for result in results:
-                    entity_summary[result.entity_type]['count'] += 1
-                    entity_summary[result.entity_type]['scores'].append(result.score)
+                    entity_type = result.entity_type
+                    # Extract detected text from the original string using start/end positions
+                    detected_text = combined_text[result.start:result.end]
+
+                    # ✅ FILTER 1: Column Context Filtering
+                    # Skip if this entity type should be ignored given the column name
+                    context_filtered = False
+                    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                            context_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            reason = f"column context ({pattern})"
+                            if reason not in filtered_reasons:
+                                filtered_reasons.append(reason)
+                            break
+
+                    if context_filtered:
+                        continue
+
+                    # ✅ FILTER 2: Value Pattern Exclusions
+                    # Skip if the detected value matches an exclusion pattern
+                    pattern_filtered = False
+                    if entity_type in EXCLUSION_PATTERNS:
+                        for exclusion_pattern in EXCLUSION_PATTERNS[entity_type]:
+                            if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                                pattern_filtered = True
+                                entity_summary[entity_type]['filtered'] += 1
+                                if "value pattern" not in filtered_reasons:
+                                    filtered_reasons.append("value pattern")
+                                break
+
+                    if pattern_filtered:
+                        continue
+
+                    # ✅ Not filtered - count as a valid detection
+                    entity_summary[entity_type]['count'] += 1
+                    entity_summary[entity_type]['scores'].append(result.score)
 
-                # Store detection results
+                # Store detection results (only non-filtered)
+                detected_types = []
                 for entity_type, info in entity_summary.items():
-                    avg_confidence = np.mean(info['scores'])
-                    pii_detections[column].append({
-                        'entity_type': entity_type,
-                        'count': info['count'],
-                        'avg_confidence': avg_confidence,
-                        'max_confidence': max(info['scores']),
-                        'min_confidence': min(info['scores'])
-                    })
+                    if info['count'] > 0:  # Only include entity types with valid (non-filtered) detections
+                        avg_confidence = np.mean(info['scores'])
+                        pii_detections[column].append({
+                            'entity_type': entity_type,
+                            'count': info['count'],
+                            'avg_confidence': avg_confidence,
+                            'max_confidence': max(info['scores']),
+                            'min_confidence': min(info['scores'])
+                        })
+                        detected_types.append(entity_type)
 
-                detected_types = [d['entity_type'] for d in pii_detections[column]]
-                print(f"✓ Found: {', '.join(detected_types)}")
+                if detected_types:
+                    print(f"✓ Found: {', '.join(detected_types)}")
+                elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                    total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                    print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                else:
+                    print("(no PII)")
             else:
                 print("(no PII)")
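Both filters are plain regex checks over the column name and the matched value, so their effect can be sanity-checked in isolation. A minimal sketch using one pattern of each kind from the config change below (the Presidio detection itself is faked):

    import re

    # One representative entry of each filter type
    COLUMN_CONTEXT_FILTERS = {r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT']}
    EXCLUSION_PATTERNS = {'US_DRIVER_LICENSE': [r'^[a-zA-Z]\d{1,2}$']}

    column, entity_type, detected_text = 'credit_rating', 'US_DRIVER_LICENSE', 'l2'

    # Filter 1: the column name says this is a rating, so a "driver license" hit is noise
    context_hit = any(
        re.search(pat, column.lower()) and entity_type in ignored
        for pat, ignored in COLUMN_CONTEXT_FILTERS.items()
    )

    # Filter 2: the value itself looks like a tier label, not a license number
    value_hit = any(
        re.match(pat, detected_text, re.IGNORECASE)
        for pat in EXCLUSION_PATTERNS.get(entity_type, [])
    )

    print(context_hit, value_hit)  # True True -> the detection is discarded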
diff --git a/data_cleaning/config.py b/data_cleaning/config.py
index f5944c5..a6ee24a 100644
--- a/data_cleaning/config.py
+++ b/data_cleaning/config.py
@@ -126,9 +126,62 @@
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
     'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
     'entities': None,  # None = detect all, or specify a list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
 }
+
+# Column Context Filters - ignore specific entity types based on column name patterns.
+# This prevents false positives when the column name provides context.
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex) -> list of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],     # Columns ending in 'id' are rarely names (also covers '_id')
+}
+
+# Value Pattern Exclusions - ignore values matching these patterns for specific entity types.
+# This catches false positives based on the format of the detected value itself.
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',   # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',     # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',       # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',        # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',    # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',                                  # Single letter with optional digits (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',                  # Boolean values
+        r'^(male|female|m|f|other)$',                      # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',    # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',   # Test mailboxes
+        r'@(test|demo|example|sample)\.',  # Test domains
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone patterns
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
+}
 
 # Custom Recognizers (domain-specific patterns)
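For reference, a minimal sketch of how the raised threshold and the allow list plug into Presidio directly (assumes presidio-analyzer with an English spaCy model installed; exact scores vary by model version, and the sample text is invented):

    from presidio_analyzer import AnalyzerEngine

    analyzer = AnalyzerEngine()

    text = "Contact ada@example.com, account tier l2"
    results = analyzer.analyze(
        text=text,
        language='en',
        score_threshold=0.6,                        # PRESIDIO_CONFIG['score_threshold']
        allow_list=['l1', 'l2', 'L1', 'L2', 'NA'],  # subset of PRESIDIO_CONFIG['allow_list']
    )
    for r in results:
        print(r.entity_type, text[r.start:r.end], round(r.score, 2))
    # Expect EMAIL_ADDRESS to survive, while 'l2' is suppressed by the allow list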
diff --git a/frontend/components/try/CenterPanel.tsx b/frontend/components/try/CenterPanel.tsx
index decb9f0..db06475 100644
--- a/frontend/components/try/CenterPanel.tsx
+++ b/frontend/components/try/CenterPanel.tsx
@@ -2,7 +2,7 @@
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
 import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/indexeddb";
-import { analyzeDataset, cleanDataset, getReportUrl, type AnalyzeResponse, type CleanResponse } from "../../lib/api";
+import { analyzeDataset, cleanDataset, detectPII, getReportUrl, type AnalyzeResponse, type CleanResponse, type DetectPIIResponse } from "../../lib/api";
 
 interface CenterPanelProps {
   tab: TryTab;
@@ -38,6 +38,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
   // Analysis results
   const [analyzeResult, setAnalyzeResult] = useState<AnalyzeResponse | null>(null);
   const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);
+  const [piiDetectionResult, setPIIDetectionResult] = useState<DetectPIIResponse | null>(null);
 
   const reset = () => {
     setFileMeta(null);
@@ -46,6 +47,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
     setProgressLabel("Processing");
     setTablePreview(null);
     setError(null);
+    setPIIDetectionResult(null);
   };
 
   // Handle API calls
@@ -71,6 +73,27 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
     }
   };
 
+  const handleDetectPII = async () => {
+    if (!uploadedFile) {
+      setError("No file uploaded");
+      return;
+    }
+
+    setIsProcessing(true);
+    setError(null);
+    setProgressLabel("Detecting PII...");
+
+    try {
+      const result = await detectPII(uploadedFile);
+      setPIIDetectionResult(result);
+      setProgressLabel("PII detection complete!");
+    } catch (err: any) {
+      setError(err.message || "PII detection failed");
+    } finally {
+      setIsProcessing(false);
+    }
+  };
+
   const handleClean = async () => {
     if (!uploadedFile) {
       setError("No file uploaded");
@@ -380,6 +403,18 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
       )}
 
+      {piiDetectionResult && (
+ 🔍 PII Detection complete! Found {piiDetectionResult.summary.risky_columns_found} risky columns in {piiDetectionResult.file_type.toUpperCase()} file. +
+ {piiDetectionResult.summary.high_risk_count} HIGH • + {piiDetectionResult.summary.medium_risk_count} MEDIUM • + {piiDetectionResult.summary.low_risk_count} LOW +
+

Review detected risks in the "Bias & Risk Mitigation" tab to choose anonymization strategies.

+
+ )} + {analyzeResult && (
✅ Analysis complete! View results in tabs. @@ -426,6 +461,7 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { setLoadedFromCache(false); setAnalyzeResult(null); setCleanResult(null); + setPIIDetectionResult(null); }} className="text-xs rounded-md border px-3 py-1.5 hover:bg-slate-50" > @@ -433,11 +469,11 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
@@ -1100,20 +1136,190 @@ export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { ); case "bias-risk-mitigation": return ( -
-

Mitigation Suggestions

- {analyzeResult && analyzeResult.recommendations.length > 0 ? ( -
- {analyzeResult.recommendations.map((rec, i) => ( -
- {rec} +
+
+

PII Detection & Anonymization Strategy

+

Review detected risky features and choose how to anonymize them

+
+ + {piiDetectionResult ? ( +
+ {/* File Info Banner */} +
+
+ File: + {piiDetectionResult.filename} + + {piiDetectionResult.file_type.toUpperCase()} + + + {piiDetectionResult.dataset_info.rows} rows × {piiDetectionResult.dataset_info.columns} columns +
- ))} +
+ + {/* Summary Card */} +
+
+
+
TOTAL COLUMNS SCANNED
+
{piiDetectionResult.summary.total_columns_scanned}
+
+
+
HIGH RISK
+
{piiDetectionResult.summary.high_risk_count}
+
Must remove
+
+
+
MEDIUM RISK
+
{piiDetectionResult.summary.medium_risk_count}
+
Hash recommended
+
+
+
LOW RISK
+
{piiDetectionResult.summary.low_risk_count}
+
Mask/generalize
+
+
+
+ {piiDetectionResult.message} +
+
+ + {/* Risky Features List */} +
+ {piiDetectionResult.risky_features.map((feature, idx) => { + const riskColor = + feature.risk_level === 'HIGH' ? 'red' : + feature.risk_level === 'MEDIUM' ? 'orange' : + feature.risk_level === 'LOW' ? 'yellow' : 'gray'; + + const bgColor = + feature.risk_level === 'HIGH' ? 'bg-red-50 border-red-300' : + feature.risk_level === 'MEDIUM' ? 'bg-orange-50 border-orange-300' : + feature.risk_level === 'LOW' ? 'bg-yellow-50 border-yellow-300' : 'bg-gray-50 border-gray-300'; + + return ( +
+ {/* Header */} +
+
+
+ + {feature.risk_level} RISK + + {feature.column} +
+
+ Detected: {feature.entity_type} + + Confidence: {(feature.confidence * 100).toFixed(1)}% + + Occurrences: {feature.detection_count} +
+
+
+ + {/* Explanation */} +
+
WHY IS THIS RISKY?
+

{feature.explanation}

+
+ GDPR Reference: {feature.gdpr_article} +
+
+ + {/* Sample Values */} + {feature.sample_values.length > 0 && ( +
+
SAMPLE VALUES
+
+ {feature.sample_values.map((val, i) => ( + + {val} + + ))} +
+
+ )} + + {/* Recommended Strategy */} +
+
+
+
✓ RECOMMENDED STRATEGY
+
{feature.recommended_strategy}
+
{feature.strategy_description}
+
+
+ Reversible: {feature.reversible ? 'Yes' : 'No'} +
+
+ Use Cases: {feature.use_cases.join(', ')} +
+
+
+ +
+
+ + {/* Alternative Strategies */} +
+ + View Alternative Strategies + +
+ {Object.entries(piiDetectionResult.available_strategies) + .filter(([strategy]) => strategy !== feature.recommended_strategy) + .map(([strategy, details]: [string, any]) => ( +
+
{strategy}
+
{details.description}
+
+ + {details.risk_level} Risk + + +
+
+ ))} +
+
+
+ ); + })} +
+ + {/* Apply All Button */} +
+ +
) : ( -

- Recommendations will appear here after analysis. -

+
+
🔍
+

No PII detection results yet

+

Upload a dataset and click "🔍 Detect PII" to scan for risky features

+
)}
      );
diff --git a/frontend/lib/api.ts b/frontend/lib/api.ts
index c330ffd..aeb3059 100644
--- a/frontend/lib/api.ts
+++ b/frontend/lib/api.ts
@@ -74,6 +74,46 @@
 export interface CleanResponse {
   timestamp: string;
 }
 
+export interface DetectPIIResponse {
+  status: string;
+  filename: string;
+  file_type: 'csv' | 'json' | 'text';
+  dataset_info: {
+    rows: number;
+    columns: number;
+    column_names: string[];
+  };
+  summary: {
+    total_columns_scanned: number;
+    risky_columns_found: number;
+    high_risk_count: number;
+    medium_risk_count: number;
+    low_risk_count: number;
+    unique_entity_types: number;
+  };
+  risky_features: Array<{
+    column: string;
+    entity_type: string;
+    risk_level: 'HIGH' | 'MEDIUM' | 'LOW' | 'UNKNOWN';
+    confidence: number;
+    detection_count: number;
+    recommended_strategy: string;
+    strategy_description: string;
+    reversible: boolean;
+    use_cases: string[];
+    gdpr_article: string;
+    sample_values: string[];
+    explanation: string;
+  }>;
+  available_strategies: Record<string, any>;
+  message: string;
+}
+
 /**
  * Analyze dataset for bias and risk
  */
@@ -114,6 +154,26 @@
   return response.json();
 }
 
+/**
+ * Detect PII (without anonymizing) for user review
+ */
+export async function detectPII(file: File): Promise<DetectPIIResponse> {
+  const formData = new FormData();
+  formData.append('file', file);
+
+  const response = await fetch(`${API_BASE_URL}/api/detect-pii`, {
+    method: 'POST',
+    body: formData,
+  });
+
+  if (!response.ok) {
+    const error = await response.json();
+    throw new Error(error.detail || 'PII detection failed');
+  }
+
+  return response.json();
+}
+
 /**
  * Download report file
  */
diff --git a/requirements.txt b/requirements.txt
index b4e9a2c..4912a74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@
 python-multipart>=0.0.6
 
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
 
 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0
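For completeness, the same round trip the frontend's detectPII helper performs can be scripted against a locally running API, as a minimal sketch (assumes the server is up at http://localhost:8000 and requests is installed; the file name is invented):

    import requests

    with open('customers.csv', 'rb') as f:
        resp = requests.post(
            'http://localhost:8000/api/detect-pii',
            files={'file': ('customers.csv', f, 'text/csv')},
        )
    resp.raise_for_status()

    report = resp.json()
    print(report['message'])
    for feature in report['risky_features']:
        print(feature['risk_level'], feature['column'], '->', feature['recommended_strategy'])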