Mirror of https://github.com/PlatypusPus/MushroomEmpire.git (synced 2026-02-07 22:18:59 +00:00)
Commit: Cleaned the Structure
.gitignore (vendored, 17 changed lines)

@@ -48,9 +48,11 @@ Thumbs.db
 # Streamlit
 .streamlit/secrets.toml
 
-# Reports
+# Reports and generated files
 reports/*.json
 reports/*.pdf
+reports/*.csv
+reports/*.html
 
 # Logs
 *.log
@@ -59,4 +61,17 @@ reports/*.pdf
 *.csv
 !Datasets/loan_data.csv
 
+# Node.js & Next.js
+node_modules/
+.next/
+out/
+.vercel
+*.tsbuildinfo
+next-env.d.ts
+
+# Frontend build artifacts
+frontend/nordic-privacy-ai/.next/
+frontend/nordic-privacy-ai/out/
+frontend/nordic-privacy-ai/node_modules/

Data

@@ -21,6 +21,7 @@ class DataProcessor:
         self.categorical_features = []
         self.feature_names = []
         self.encoders = {}
+        self.target_encoder = None  # Add target encoder
         self.scaler = StandardScaler()
 
         self.X_train = None
@@ -75,6 +76,13 @@ class DataProcessor:
         X = self.df[feature_cols].copy()
         y = self.df[self.target_column].copy()
 
+        # Encode target variable if it's categorical
+        if y.dtype == 'object' or y.dtype.name == 'category':
+            self.target_encoder = LabelEncoder()
+            y_encoded = self.target_encoder.fit_transform(y)
+            y = pd.Series(y_encoded, index=y.index)
+            print(f"Target '{self.target_column}' encoded: {dict(enumerate(self.target_encoder.classes_))}")
+
         # Encode categorical variables
         for col in self.categorical_features:
             if col in X.columns:

api/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# Empty __init__.py files for Python package structure

api/main.py (new file, 72 lines)

@@ -0,0 +1,72 @@
"""
FastAPI Backend for Nordic Privacy AI
Provides endpoints for AI Governance analysis and data cleaning
"""

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import os

from api.routers import analyze, clean

# Create FastAPI app
app = FastAPI(
    title="Nordic Privacy AI API",
    description="AI-powered GDPR compliance, bias detection, and risk analysis",
    version="1.0.0"
)

# CORS configuration for Next.js frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        "http://localhost:3000",  # Next.js dev server
        "http://127.0.0.1:3000",
    ],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount reports directory for file downloads
reports_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "reports")
os.makedirs(reports_dir, exist_ok=True)
app.mount("/reports", StaticFiles(directory=reports_dir), name="reports")

# Include routers
app.include_router(analyze.router, prefix="/api", tags=["AI Governance"])
app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"])

@app.get("/")
async def root():
    """Health check endpoint"""
    return {
        "status": "online",
        "service": "Nordic Privacy AI API",
        "version": "1.0.0",
        "endpoints": {
            "analyze": "/api/analyze",
            "clean": "/api/clean",
            "docs": "/docs"
        }
    }

@app.get("/health")
async def health_check():
    """Detailed health check"""
    try:
        import torch
        cuda_available = torch.cuda.is_available()
        gpu_name = torch.cuda.get_device_name(0) if cuda_available else None
    except:
        cuda_available = False
        gpu_name = None

    return {
        "status": "healthy",
        "gpu_acceleration": {
            "available": cuda_available,
            "device": gpu_name or "CPU"
        }
    }

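Not part of the commit itself, just a quick way to sanity-check the new backend: assuming the packages from requirements.txt are installed and the app is served locally with uvicorn (for example `uvicorn api.main:app`, which listens on port 8000 by default), the new health endpoint can be probed with nothing but the standard library.

```python
# Hypothetical smoke test for the /health endpoint added above (not in the diff).
# Assumes the API is already running locally on uvicorn's default port 8000.
import json
import urllib.request

with urllib.request.urlopen("http://127.0.0.1:8000/health") as resp:
    payload = json.load(resp)

# Expected shape per health_check(): {"status": "healthy", "gpu_acceleration": {...}}
print(payload["status"], payload["gpu_acceleration"]["device"])
```
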
api/routers/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# Empty __init__.py files for Python package structure

api/routers/analyze.py (new file, 141 lines)

@@ -0,0 +1,141 @@
"""
AI Governance Analysis Router
Handles bias detection and risk analysis endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import json
from datetime import datetime
from typing import Dict, Any

# Import AI Governance modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from ai_governance import AIGovernanceAnalyzer

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj


@router.post("/analyze")
async def analyze_dataset(file: UploadFile = File(...)):
    """
    Analyze uploaded dataset for bias and risk

    - **file**: CSV file to analyze

    Returns:
    - Analysis results (bias metrics, risk assessment)
    - Report file path for download
    """

    # Validate file type
    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    try:
        # Read uploaded file
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Initialize AI Governance Analyzer
        analyzer = AIGovernanceAnalyzer()

        # Auto-detect target column and protected attributes
        # Target: Last column (common convention) or first binary/categorical column
        target_column = df.columns[-1]

        # Protected attributes: Common sensitive columns
        protected_keywords = ['gender', 'age', 'race', 'sex', 'ethnicity', 'religion', 'nationality']
        protected_attributes = [col for col in df.columns
                                if any(keyword in col.lower() for keyword in protected_keywords)]

        # If no protected attributes found, use first few categorical columns
        if not protected_attributes:
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            protected_attributes = [col for col in categorical_cols if col != target_column][:3]

        print(f"Analyzing dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        print(f"Target column: {target_column}")
        print(f"Protected attributes: {protected_attributes}")

        # Run analysis
        report = analyzer.analyze_dataframe(df, target_column, protected_attributes)

        # Generate report filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = (file.filename or "dataset").replace('.csv', '')
        report_filename = f"governance_report_{safe_filename}_{timestamp}.json"
        report_path = os.path.join("reports", report_filename)

        # Save full report to disk
        full_report_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            report_path
        )
        analyzer.save_report(report, full_report_path)

        # Prepare response with summary
        response_data = {
            "status": "success",
            "filename": file.filename,
            "dataset_info": {
                "rows": len(df),
                "columns": len(df.columns),
                "features": list(df.columns)
            },
            "model_performance": {
                "accuracy": report.get("model_metrics", {}).get("accuracy", 0),
                "precision": report.get("model_metrics", {}).get("precision", 0),
                "recall": report.get("model_metrics", {}).get("recall", 0),
                "f1_score": report.get("model_metrics", {}).get("f1_score", 0)
            },
            "bias_metrics": {
                "overall_bias_score": report.get("bias_metrics", {}).get("overall_bias_score", 0),
                "disparate_impact": report.get("bias_metrics", {}).get("disparate_impact", {}),
                "statistical_parity": report.get("bias_metrics", {}).get("statistical_parity_difference", {}),
                "violations_detected": report.get("bias_metrics", {}).get("fairness_violations", [])
            },
            "risk_assessment": {
                "overall_risk_score": report.get("risk_metrics", {}).get("overall_risk_score", 0),
                "privacy_risks": report.get("risk_metrics", {}).get("privacy_risks", []),
                "ethical_risks": report.get("risk_metrics", {}).get("ethical_risks", []),
                "compliance_risks": report.get("risk_metrics", {}).get("compliance_risks", []),
                "data_quality_risks": report.get("risk_metrics", {}).get("data_quality_risks", [])
            },
            "recommendations": report.get("recommendations", []),
            "report_file": f"/{report_path}",
            "timestamp": datetime.now().isoformat()
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")

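For illustration only (this snippet is not in the commit): the convert_to_serializable helper defined in both routers recursively maps NumPy scalars and arrays to built-in Python types so the summary dictionary can be JSON-encoded without errors. A minimal check, assuming the project's dependencies are installed and the import runs from the repository root:

```python
# Hypothetical usage of the helper added in api/routers/analyze.py (not in the diff).
import json
import numpy as np
from api.routers.analyze import convert_to_serializable

raw = {"accuracy": np.float64(0.91), "violations": np.array([1, 3])}
clean = convert_to_serializable(raw)   # {'accuracy': 0.91, 'violations': [1, 3]}
print(json.dumps(clean))               # serializes without a TypeError
```
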
api/routers/clean.py (new file, 142 lines)

@@ -0,0 +1,142 @@
"""
Data Cleaning Router
Handles PII detection and anonymization endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
from datetime import datetime
from typing import Dict, Any

# Import cleaning module
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from data_cleaning import DataCleaner

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj


@router.post("/clean")
async def clean_dataset(file: UploadFile = File(...)):
    """
    Clean uploaded dataset - detect and anonymize PII

    - **file**: CSV file to clean

    Returns:
    - Cleaned dataset statistics
    - PII detections and anonymization actions
    - Report file path for download
    - Cleaned CSV file path for download
    """

    # Validate file type
    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    try:
        # Read uploaded file
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Initialize Data Cleaner (with GPU if available)
        print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        cleaner = DataCleaner(df, use_gpu=True)

        # Run cleaning (non-interactive mode for API)
        cleaned_df, audit_report = cleaner.clean(
            risky_features=None,  # Auto-detect
            interactive=False,  # No user prompts in API mode
            scan_all_cells=True
        )

        # Generate filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = file.filename.replace('.csv', '')

        # Save cleaned CSV
        cleaned_csv_filename = f"cleaned_{safe_filename}_{timestamp}.csv"
        cleaned_csv_path = os.path.join("reports", cleaned_csv_filename)
        full_cleaned_csv_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            cleaned_csv_path
        )
        cleaner.save_cleaned_data(cleaned_df, full_cleaned_csv_path)

        # Save audit report
        audit_report_filename = f"cleaning_audit_{safe_filename}_{timestamp}.json"
        audit_report_path = os.path.join("reports", audit_report_filename)
        full_audit_report_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            audit_report_path
        )
        cleaner.save_audit_report(audit_report, full_audit_report_path)

        # Prepare response
        response_data = {
            "status": "success",
            "filename": file.filename,
            "dataset_info": {
                "original_rows": int(audit_report["metadata"]["original_rows"]),
                "original_columns": int(audit_report["metadata"]["original_columns"]),
                "cleaned_rows": int(audit_report["metadata"]["cleaned_rows"]),
                "cleaned_columns": int(audit_report["metadata"]["cleaned_columns"])
            },
            "gpu_acceleration": audit_report["metadata"].get("gpu_acceleration", {
                "enabled": False,
                "device": "CPU"
            }),
            "summary": {
                "columns_removed": audit_report["summary"]["columns_removed"],
                "columns_anonymized": audit_report["summary"]["columns_anonymized"],
                "total_cells_affected": int(audit_report["summary"]["total_cells_affected"])
            },
            "pii_detections": {
                col: {
                    "action": details["action"],
                    "entity_types": details["entity_types_found"],
                    "num_affected_rows": int(details.get("num_affected_rows", 0)),
                    "examples": details.get("examples", [])[:2]  # Show 2 examples
                }
                for col, details in audit_report["details"].items()
            },
            "gdpr_compliance": audit_report["compliance"]["gdpr_articles_applied"],
            "files": {
                "cleaned_csv": f"/{cleaned_csv_path}",
                "audit_report": f"/{audit_report_path}"
            },
            "timestamp": datetime.now().isoformat()
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except Exception as e:
        print(f"Error during cleaning: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Cleaning failed: {str(e)}")

api/utils/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# Empty __init__.py files for Python package structure

data_cleaning/__init__.py (new file, 12 lines)

@@ -0,0 +1,12 @@
"""
Data Cleaning Module
Automated PII detection and GDPR-compliant anonymization
"""

from .cleaner import DataCleaner

__version__ = '1.0.0'

__all__ = [
    'DataCleaner'
]

Deleted file (167 lines)

@@ -1,167 +0,0 @@
import csv
import re
from pathlib import Path
from collections import Counter
from datetime import datetime

ROOT = Path("../Data/Politics")

# Try to import spaCy, fall back to basic extraction if not available
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    USE_SPACY = True
except:
    USE_SPACY = False

# Regex patterns for deterministic detection
patterns = {
    "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"),
    "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"),
    "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"),
    "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"),
    "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"),
    "URL": re.compile(r"https?://[^\s]+"),
    "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
}

def find_entities(text):
    """Extract entities using regex patterns."""
    found = {}
    for label, pattern in patterns.items():
        matches = pattern.findall(text)
        if matches:
            found[label] = list(set(matches))[:5]  # Limit to 5 per type
    return found

def extract_with_spacy(text):
    """Extract named entities using spaCy."""
    if not USE_SPACY:
        return {}, {}, {}

    doc = nlp(text[:10000])  # Limit text length for performance

    persons = []
    orgs = []
    locations = []

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            persons.append(ent.text)
        elif ent.label_ == "ORG":
            orgs.append(ent.text)
        elif ent.label_ in ["GPE", "LOC"]:
            locations.append(ent.text)

    # Return most common entities
    return (
        dict(Counter(persons).most_common(5)),
        dict(Counter(orgs).most_common(5)),
        dict(Counter(locations).most_common(5))
    )

def extract_metadata(text, filename):
    """Extract basic metadata from text."""
    metadata = {
        "char_count": len(text),
        "word_count": len(text.split()),
        "line_count": text.count('\n') + 1,
        "file_extension": Path(filename).suffix,
    }
    return metadata

def detect_content_type(text):
    """Heuristic content type detection."""
    text_lower = text.lower()

    # Check for common document types
    if any(word in text_lower[:1000] for word in ['dear', 'sincerely', 'regards']):
        return "letter"
    elif any(word in text_lower[:500] for word in ['article', 'section', 'amendment']):
        return "legal"
    elif any(word in text_lower[:500] for word in ['press release', 'for immediate release']):
        return "press_release"
    elif re.search(r'^\s*#', text[:100], re.MULTILINE):
        return "markdown"
    elif '<html' in text_lower[:200]:
        return "html"
    else:
        return "unknown"

# Define fieldnames
fieldnames = [
    "filename", "file_extension", "char_count", "word_count", "line_count",
    "content_type", "text_preview",
    "EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN",
    "persons", "organizations", "locations"
]

print("Processing files...")
with open("discovery_dataset.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    file_count = 0
    for path in ROOT.rglob("*"):
        if not path.is_file():
            continue

        # Skip binary files
        if path.suffix.lower() in ['.exe', '.dll', '.so', '.dylib', '.bin', '.jpg', '.png', '.gif', '.pdf']:
            continue

        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as e:
            print(f" Error reading {path.name}: {e}")
            continue

        if not text.strip():
            continue

        file_count += 1
        if file_count % 10 == 0:
            print(f"Processed {file_count} files...")

        # Initialize row
        row = {"filename": str(path.relative_to(ROOT.parent))}

        # Extract metadata
        metadata = extract_metadata(text, path.name)
        row.update(metadata)

        # Detect content type
        row["content_type"] = detect_content_type(text)
        row["text_preview"] = text[:500].replace('\n', ' ').replace('\r', ' ')

        # Extract entities with regex
        entities = find_entities(text)
        for key, values in entities.items():
            row[key] = "; ".join(values) if values else ""

        # Fill in missing pattern fields
        for pattern_key in ["EMAIL", "PHONE", "UUID", "IBAN", "DATE", "URL", "SSN"]:
            if pattern_key not in row:
                row[pattern_key] = ""

        # Extract named entities with spaCy
        if USE_SPACY:
            persons, orgs, locs = extract_with_spacy(text)
            row["persons"] = "; ".join([f"{k}({v})" for k, v in persons.items()])
            row["organizations"] = "; ".join([f"{k}({v})" for k, v in orgs.items()])
            row["locations"] = "; ".join([f"{k}({v})" for k, v in locs.items()])
        else:
            row["persons"] = ""
            row["organizations"] = ""
            row["locations"] = ""

        writer.writerow(row)

print(f"\nComplete! Processed {file_count} files.")
print(f"Output: discovery_dataset.csv")

# Print summary statistics
if file_count > 0:
    print("\nTo install spaCy for better entity extraction:")
    print("  pip install spacy")
    print("  python -m spacy download en_core_web_sm")

@@ -14,10 +14,10 @@ export default function TryPage() {
       <div className="flex flex-1 min-h-0">
         <Sidebar value={tab} onChange={setTab} />
         <div className="flex-1 min-h-0 flex">
-          <div className="flex-1 min-h-0"><CenterPanel tab={tab} /></div>
+          <div className="flex-1 min-h-0 min-w-0"><CenterPanel tab={tab} onAnalyze={() => setTab("bias-analysis")} /></div>
           <div className="w-[360px] hidden xl:block"><ChatbotPanel /></div>
         </div>
       </div>
     </main>
   );
 }

frontend/components/try/CenterPanel.tsx (new file, 620 lines)

@@ -0,0 +1,620 @@
"use client";
import { TryTab } from "./Sidebar";
import { useState, useRef, useCallback, useEffect } from "react";
import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/indexeddb";
import { analyzeDataset, cleanDataset, getReportUrl, type AnalyzeResponse, type CleanResponse } from "../../lib/api";

interface CenterPanelProps {
  tab: TryTab;
  onAnalyze?: () => void;
}

interface UploadedFileMeta {
  name: string;
  size: number;
  type: string;
  contentPreview: string;
}

interface TablePreviewData {
  headers: string[];
  rows: string[][];
  origin: 'csv';
}

export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) {
  const PREVIEW_BYTES = 64 * 1024; // read first 64KB slice for large-file preview
  const [fileMeta, setFileMeta] = useState<UploadedFileMeta | null>(null);
  const [uploadedFile, setUploadedFile] = useState<File | null>(null);
  const [isDragging, setIsDragging] = useState(false);
  const [progress, setProgress] = useState<number>(0);
  const [progressLabel, setProgressLabel] = useState<string>("Processing");
  const [tablePreview, setTablePreview] = useState<TablePreviewData | null>(null);
  const inputRef = useRef<HTMLInputElement | null>(null);
  const [loadedFromCache, setLoadedFromCache] = useState(false);
  const [isProcessing, setIsProcessing] = useState(false);
  const [error, setError] = useState<string | null>(null);

  // Analysis results
  const [analyzeResult, setAnalyzeResult] = useState<AnalyzeResponse | null>(null);
  const [cleanResult, setCleanResult] = useState<CleanResponse | null>(null);

  const reset = () => {
    setFileMeta(null);
    setUploadedFile(null);
    setProgress(0);
    setProgressLabel("Processing");
    setTablePreview(null);
    setError(null);
  };

  // Handle API calls
  const handleAnalyze = async () => {
    if (!uploadedFile) {
      setError("No file uploaded");
      return;
    }

    setIsProcessing(true);
    setError(null);
    setProgressLabel("Analyzing dataset...");

    try {
      const result = await analyzeDataset(uploadedFile);
      setAnalyzeResult(result);
      setProgressLabel("Analysis complete!");
      onAnalyze?.(); // Navigate to bias-analysis tab
    } catch (err: any) {
      setError(err.message || "Analysis failed");
    } finally {
      setIsProcessing(false);
    }
  };

  const handleClean = async () => {
    if (!uploadedFile) {
      setError("No file uploaded");
      return;
    }

    setIsProcessing(true);
    setError(null);
    setProgressLabel("Cleaning dataset...");

    try {
      const result = await cleanDataset(uploadedFile);
      setCleanResult(result);
      setProgressLabel("Cleaning complete!");
    } catch (err: any) {
      setError(err.message || "Cleaning failed");
    } finally {
      setIsProcessing(false);
    }
  }; function tryParseCSV(text: string, maxRows = 50, maxCols = 40): TablePreviewData | null {
    const lines = text.split(/\r?\n/).filter(l => l.trim().length > 0);
    if (lines.length < 2) return null;
    const commaDensity = lines.slice(0, 10).filter(l => l.includes(',')).length;
    if (commaDensity < 2) return null;
    const parseLine = (line: string) => {
      const out: string[] = [];
      let cur = '';
      let inQuotes = false;
      for (let i = 0; i < line.length; i++) {
        const ch = line[i];
        if (ch === '"') {
          if (inQuotes && line[i + 1] === '"') { cur += '"'; i++; } else { inQuotes = !inQuotes; }
        } else if (ch === ',' && !inQuotes) {
          out.push(cur);
          cur = '';
        } else { cur += ch; }
      }
      out.push(cur);
      return out.map(c => c.trim());
    };
    const raw = lines.slice(0, maxRows).map(parseLine);
    if (raw.length === 0) return null;
    const headers = raw[0];
    const colCount = Math.min(headers.length, maxCols);
    const rows = raw.slice(1).map(r => r.slice(0, colCount));
    return { headers: headers.slice(0, colCount), rows, origin: 'csv' };
  }

  // We no longer build table preview for JSON; revert JSON to raw text view.

  const processFile = useCallback(async (f: File) => {
    if (!f) return;
    const isCSV = /\.csv$/i.test(f.name);
    setProgress(0);
    setUploadedFile(f); // Save the file for API calls

    // For large files, show a progress bar while reading the file stream (no preview)
    if (f.size > 1024 * 1024) {
      setProgressLabel("Uploading");
      const metaObj: UploadedFileMeta = {
        name: f.name,
        size: f.size,
        type: f.type || "unknown",
        contentPreview: `Loading partial preview (first ${Math.round(PREVIEW_BYTES/1024)}KB)...`,
      };
      setFileMeta(metaObj);
      setTablePreview(null);
      // Save to IndexedDB immediately so it persists without needing full read
      (async () => {
        try { await saveLatestUpload(f, metaObj); } catch {}
      })();
      // Read head slice for partial preview & possible CSV table extraction
      try {
        const headBlob = f.slice(0, PREVIEW_BYTES);
        const headReader = new FileReader();
        headReader.onload = async () => {
          try {
            const buf = headReader.result as ArrayBuffer;
            const decoder = new TextDecoder();
            const text = decoder.decode(buf);
            setFileMeta(prev => prev ? { ...prev, contentPreview: text.slice(0, 4000) } : prev);
            if (isCSV) {
              const parsed = tryParseCSV(text);
              setTablePreview(parsed);
            } else {
              setTablePreview(null);
            }
            try { await saveLatestUpload(f, { ...metaObj, contentPreview: text.slice(0, 4000) }); } catch {}
          } catch { /* ignore */ }
        };
        headReader.readAsArrayBuffer(headBlob);
      } catch { /* ignore */ }
      // Use streaming read for progress without buffering entire file in memory
      try {
        const stream: ReadableStream<Uint8Array> | undefined = (typeof (f as any).stream === "function" ? (f as any).stream() : undefined);
        if (stream && typeof stream.getReader === "function") {
          const reader = stream.getReader();
          let loaded = 0;
          const total = f.size || 1;
          for (;;) {
            const { done, value } = await reader.read();
            if (done) break;
            loaded += value ? value.length : 0;
            const pct = Math.min(100, Math.round((loaded / total) * 100));
            setProgress(pct);
          }
          setProgress(100);
        } else {
          // Fallback to FileReader progress events
          const reader = new FileReader();
          reader.onprogress = (evt) => {
            if (evt.lengthComputable) {
              const pct = Math.min(100, Math.round((evt.loaded / evt.total) * 100));
              setProgress(pct);
            } else {
              setProgress((p) => (p < 90 ? p + 5 : p));
            }
          };
          reader.onloadend = () => setProgress(100);
          reader.onerror = () => setProgress(0);
          reader.readAsArrayBuffer(f);
        }
      } catch {
        setProgress(100);
      }
      return;
    }
    const reader = new FileReader();
    reader.onprogress = (evt) => {
      if (evt.lengthComputable) {
        const pct = Math.min(100, Math.round((evt.loaded / evt.total) * 100));
        setProgress(pct);
      } else {
        setProgress((p) => (p < 90 ? p + 5 : p));
      }
    };
    reader.onload = async () => {
      try {
        const buf = reader.result as ArrayBuffer;
        const decoder = new TextDecoder();
        const text = decoder.decode(buf);
        const metaObj: UploadedFileMeta = {
          name: f.name,
          size: f.size,
          type: f.type || "unknown",
          contentPreview: text.slice(0, 4000),
        };
        setFileMeta(metaObj);
        if (isCSV) {
          const parsed = tryParseCSV(text);
          setTablePreview(parsed);
        } else {
          setTablePreview(null);
        }
        // Save file blob and meta to browser cache (IndexedDB)
        try {
          await saveLatestUpload(f, metaObj);
        } catch {}
        setProgressLabel("Processing");
        setProgress(100);
      } catch (e) {
        const metaObj: UploadedFileMeta = {
          name: f.name,
          size: f.size,
          type: f.type || "unknown",
          contentPreview: "Unable to decode preview.",
        };
        setFileMeta(metaObj);
        setTablePreview(null);
        try {
          await saveLatestUpload(f, metaObj);
        } catch {}
        setProgressLabel("Processing");
        setProgress(100);
      }
    };
    reader.onerror = () => {
      setProgress(0);
    };
    reader.readAsArrayBuffer(f);
  }, []);

  function handleFileChange(e: React.ChangeEvent<HTMLInputElement>) {
    const f = e.target.files?.[0];
    processFile(f as File);
  }

  const onDragOver = (e: React.DragEvent<HTMLDivElement>) => {
    e.preventDefault();
    setIsDragging(true);
  };
  const onDragLeave = () => setIsDragging(false);
  const onDrop = (e: React.DragEvent<HTMLDivElement>) => {
    e.preventDefault();
    setIsDragging(false);
    const f = e.dataTransfer.files?.[0];
    processFile(f as File);
  };

  // Load last cached upload on mount (processing tab only)
  useEffect(() => {
    let ignore = false;
    if (tab !== "processing") return;
    (async () => {
      try {
        const { file, meta } = await getLatestUpload();
        if (!ignore && meta) {
          setFileMeta(meta as UploadedFileMeta);
          if (file) {
            setUploadedFile(file);
          }
          setLoadedFromCache(true);
        }
      } catch {}
    })();
    return () => {
      ignore = true;
    };
  }, [tab]); function renderTabContent() {
    switch (tab) {
      case "processing":
        return (
          <div className="space-y-4 max-w-[1100px] xl:max-w-[1200px] w-full mx-auto">
            <h2 className="text-xl font-semibold">Upload & Process Data</h2>
            <p className="text-sm text-slate-600">Upload a CSV / JSON / text file. We will later parse, detect PII, and queue analyses.</p>
            <div className="flex flex-col gap-3 min-w-0">
              <div
                onDragOver={onDragOver}
                onDragLeave={onDragLeave}
                onDrop={onDrop}
                className={
                  "rounded-lg border-2 border-dashed p-6 text-center transition-colors " +
                  (isDragging ? "border-brand-600 bg-brand-50" : "border-slate-300 hover:border-brand-300")
                }
              >
                <p className="text-sm text-slate-600">Drag & drop a CSV / JSON / TXT here, or click to browse.</p>
                <div className="mt-3">
                  <button
                    type="button"
                    onClick={() => inputRef.current?.click()}
                    className="inline-flex items-center rounded-md bg-brand-600 px-4 py-2 text-white text-sm font-medium shadow hover:bg-brand-500"
                  >
                    Choose file
                  </button>
                </div>
              </div>
              <input
                ref={inputRef}
                type="file"
                accept=".csv,.json,.txt"
                onChange={handleFileChange}
                className="hidden"
                aria-hidden
              />
              {progress > 0 && (
                <div className="w-full">
                  <div className="h-2 w-full rounded-full bg-slate-200 overflow-hidden">
                    <div
                      className="h-2 bg-brand-600 transition-all"
                      style={{ width: `${progress}%` }}
                    />
                  </div>
                  <div className="mt-1 text-xs text-slate-500">{progressLabel} {progress}%</div>
                </div>
              )}
              {fileMeta && (
                <div className="rounded-md border border-slate-200 p-4 bg-white shadow-sm">
                  <div className="flex items-center justify-between mb-2">
                    <div className="text-sm font-medium">{fileMeta.name}</div>
                    <div className="text-xs text-slate-500">{Math.round(fileMeta.size / 1024)} KB</div>
                  </div>
                  {loadedFromCache && (
                    <div className="mb-2 text-[11px] text-brand-700">Loaded from browser cache</div>
                  )}
                  <div className="mb-3 text-xs text-slate-500">{fileMeta.type || "Unknown type"}</div>
                  {/* Table preview when structured data detected; otherwise show text */}
                  {tablePreview && tablePreview.origin === 'csv' ? (
                    <div className="max-h-64 w-full min-w-0 overflow-x-auto overflow-y-auto rounded-md bg-slate-50">
                      <table className="min-w-full text-xs">
                        <thead className="sticky top-0 bg-slate-100">
                          <tr>
                            {tablePreview.headers.map((h, idx) => (
                              <th key={idx} className="text-left font-semibold px-3 py-2 border-b border-slate-200 whitespace-nowrap">{h}</th>
                            ))}
                          </tr>
                        </thead>
                        <tbody>
                          {tablePreview.rows.map((r, i) => (
                            <tr key={i} className={i % 2 === 0 ? "" : "bg-slate-100/50"}>
                              {r.map((c, j) => (
                                <td key={j} className="px-3 py-1.5 border-b border-slate-100 whitespace-nowrap max-w-[24ch] overflow-hidden text-ellipsis">{c}</td>
                              ))}
                            </tr>
                          ))}
                        </tbody>
                      </table>
                    </div>
                  ) : (
                    <pre className="max-h-64 overflow-auto text-xs bg-slate-50 p-3 rounded-md whitespace-pre-wrap leading-relaxed">
                      {fileMeta.contentPreview || "(no preview)"}
                    </pre>
                  )}

                  {error && (
                    <div className="mt-3 p-3 bg-red-50 border border-red-200 rounded-md text-sm text-red-700">
                      ❌ {error}
                    </div>
                  )}

                  {analyzeResult && (
                    <div className="mt-3 p-3 bg-green-50 border border-green-200 rounded-md text-sm text-green-700">
                      ✅ Analysis complete! View results in tabs.
                      <a
                        href={getReportUrl(analyzeResult.report_file)}
                        target="_blank"
                        rel="noopener noreferrer"
                        className="ml-2 underline"
                      >
                        Download Report
                      </a>
                    </div>
                  )}

                  {cleanResult && (
                    <div className="mt-3 p-3 bg-green-50 border border-green-200 rounded-md text-sm text-green-700">
                      ✅ Cleaning complete! {cleanResult.summary.total_cells_affected} cells anonymized.
                      <div className="mt-2 flex gap-2">
                        <a
                          href={getReportUrl(cleanResult.files.cleaned_csv)}
                          download
                          className="underline"
                        >
                          Download Cleaned CSV
                        </a>
                        <a
                          href={getReportUrl(cleanResult.files.audit_report)}
                          target="_blank"
                          rel="noopener noreferrer"
                          className="underline"
                        >
                          View Audit Report
                        </a>
                      </div>
                    </div>
                  )}

                  <div className="mt-3 flex justify-end gap-2">
                    <button
                      type="button"
                      onClick={async () => {
                        reset();
                        try { await deleteLatestUpload(); } catch {}
                        setLoadedFromCache(false);
                        setAnalyzeResult(null);
                        setCleanResult(null);
                      }}
                      className="text-xs rounded-md border px-3 py-1.5 hover:bg-slate-50"
                    >
                      Clear
                    </button>
                    <button
                      type="button"
                      onClick={handleClean}
                      disabled={isProcessing}
                      className="text-xs rounded-md bg-green-600 text-white px-3 py-1.5 hover:bg-green-500 disabled:opacity-50 disabled:cursor-not-allowed"
                    >
                      {isProcessing ? "Processing..." : "Clean (PII)"}
                    </button>
                    <button
                      type="button"
                      onClick={handleAnalyze}
                      disabled={isProcessing}
                      className="text-xs rounded-md bg-brand-600 text-white px-3 py-1.5 hover:bg-brand-500 disabled:opacity-50 disabled:cursor-not-allowed"
                    >
                      {isProcessing ? "Processing..." : "Analyze"}
                    </button>
                  </div>
                </div>
              )}
            </div>
          </div>
        );
      case "bias-analysis":
        return (
          <div className="space-y-4">
            <h2 className="text-xl font-semibold">Bias Analysis</h2>
            {analyzeResult ? (
              <div className="space-y-4">
                <div className="grid grid-cols-2 gap-4">
                  <div className="p-4 bg-white rounded-lg border">
                    <div className="text-sm text-slate-600">Overall Bias Score</div>
                    <div className="text-2xl font-bold">{(analyzeResult.bias_metrics.overall_bias_score * 100).toFixed(1)}%</div>
                  </div>
                  <div className="p-4 bg-white rounded-lg border">
                    <div className="text-sm text-slate-600">Violations Detected</div>
                    <div className="text-2xl font-bold">{analyzeResult.bias_metrics.violations_detected.length}</div>
                  </div>
                </div>

                <div className="p-4 bg-white rounded-lg border">
                  <h3 className="font-semibold mb-2">Model Performance</h3>
                  <div className="grid grid-cols-4 gap-2 text-sm">
                    <div>
                      <div className="text-slate-600">Accuracy</div>
                      <div className="font-medium">{(analyzeResult.model_performance.accuracy * 100).toFixed(1)}%</div>
                    </div>
                    <div>
                      <div className="text-slate-600">Precision</div>
                      <div className="font-medium">{(analyzeResult.model_performance.precision * 100).toFixed(1)}%</div>
                    </div>
                    <div>
                      <div className="text-slate-600">Recall</div>
                      <div className="font-medium">{(analyzeResult.model_performance.recall * 100).toFixed(1)}%</div>
                    </div>
                    <div>
                      <div className="text-slate-600">F1 Score</div>
                      <div className="font-medium">{(analyzeResult.model_performance.f1_score * 100).toFixed(1)}%</div>
                    </div>
                  </div>
                </div>
              </div>
            ) : (
              <p className="text-sm text-slate-600">Upload and analyze a dataset to see bias metrics.</p>
            )}
          </div>
        );
      case "risk-analysis":
        return (
          <div className="space-y-4">
            <h2 className="text-xl font-semibold">Risk Analysis</h2>
            {analyzeResult ? (
              <div className="space-y-4">
                <div className="p-4 bg-white rounded-lg border">
                  <div className="text-sm text-slate-600">Overall Risk Score</div>
                  <div className="text-2xl font-bold">{(analyzeResult.risk_assessment.overall_risk_score * 100).toFixed(1)}%</div>
                </div>

                {cleanResult && (
                  <div className="p-4 bg-white rounded-lg border">
                    <h3 className="font-semibold mb-2">PII Detection Results</h3>
                    <div className="text-sm space-y-1">
                      <div>Cells Anonymized: <span className="font-medium">{cleanResult.summary.total_cells_affected}</span></div>
                      <div>Columns Removed: <span className="font-medium">{cleanResult.summary.columns_removed.length}</span></div>
                      <div>Columns Anonymized: <span className="font-medium">{cleanResult.summary.columns_anonymized.length}</span></div>
                    </div>
                  </div>
                )}
              </div>
            ) : (
              <p className="text-sm text-slate-600">Upload and analyze a dataset to see risk assessment.</p>
            )}
          </div>
        );
      case "bias-risk-mitigation":
        return (
          <div className="space-y-4">
            <h2 className="text-xl font-semibold">Mitigation Suggestions</h2>
            {analyzeResult && analyzeResult.recommendations.length > 0 ? (
              <div className="space-y-2">
                {analyzeResult.recommendations.map((rec, i) => (
                  <div key={i} className="p-3 bg-blue-50 border border-blue-200 rounded-md text-sm">
                    {rec}
                  </div>
                ))}
              </div>
            ) : (
              <p className="text-sm text-slate-600">
                Recommendations will appear here after analysis.
              </p>
            )}
          </div>
        );
      case "results":
        return (
          <div className="space-y-4">
            <h2 className="text-xl font-semibold">Results Summary</h2>
            {(analyzeResult || cleanResult) ? (
              <div className="space-y-4">
                {analyzeResult && (
                  <div className="p-4 bg-white rounded-lg border">
                    <h3 className="font-semibold mb-2">Analysis Results</h3>
                    <div className="text-sm space-y-1">
                      <div>Dataset: {analyzeResult.filename}</div>
                      <div>Rows: {analyzeResult.dataset_info.rows}</div>
                      <div>Columns: {analyzeResult.dataset_info.columns}</div>
                      <div>Bias Score: {(analyzeResult.bias_metrics.overall_bias_score * 100).toFixed(1)}%</div>
                      <div>Risk Score: {(analyzeResult.risk_assessment.overall_risk_score * 100).toFixed(1)}%</div>
                    </div>
                    <a
                      href={getReportUrl(analyzeResult.report_file)}
                      target="_blank"
                      rel="noopener noreferrer"
                      className="mt-3 inline-block text-sm text-brand-600 underline"
                    >
                      Download Full Report →
                    </a>
                  </div>
                )}

                {cleanResult && (
                  <div className="p-4 bg-white rounded-lg border">
                    <h3 className="font-semibold mb-2">Cleaning Results</h3>
                    <div className="text-sm space-y-1">
                      <div>Original: {cleanResult.dataset_info.original_rows} rows × {cleanResult.dataset_info.original_columns} cols</div>
                      <div>Cleaned: {cleanResult.dataset_info.cleaned_rows} rows × {cleanResult.dataset_info.cleaned_columns} cols</div>
                      <div>Cells Anonymized: {cleanResult.summary.total_cells_affected}</div>
                      <div>Columns Removed: {cleanResult.summary.columns_removed.length}</div>
                      <div>GDPR Compliant: {cleanResult.gdpr_compliance.length} articles applied</div>
                    </div>
                    <div className="mt-3 flex gap-2">
                      <a
                        href={getReportUrl(cleanResult.files.cleaned_csv)}
                        download
                        className="text-sm text-brand-600 underline"
                      >
                        Download Cleaned CSV →
                      </a>
                      <a
                        href={getReportUrl(cleanResult.files.audit_report)}
                        target="_blank"
                        rel="noopener noreferrer"
                        className="text-sm text-brand-600 underline"
                      >
                        View Audit Report →
                      </a>
                    </div>
                  </div>
                )}
              </div>
            ) : (
              <p className="text-sm text-slate-600">
                Process a dataset to see aggregated results.
              </p>
            )}
          </div>
        );
      default:
        return null;
    }
  }

  return (
    <div className="h-full overflow-y-auto p-6 bg-white/60">
      {renderTabContent()}
    </div>
  );
}

@@ -1,7 +1,7 @@
 "use client";
 import { TryTab } from "./Sidebar";
 import { useState, useRef, useCallback, useEffect } from "react";
-import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/idb";
 
 interface CenterPanelProps {
   tab: TryTab;

Deleted file (28 lines)

@@ -1,28 +0,0 @@
# Nordic Privacy AI

AI-Powered GDPR compliance & personal data protection platform tailored for Nordic ecosystems (BankID, MitID, Suomi.fi).

## Tech Stack
- Next.js (App Router, TypeScript)
- Tailwind CSS

## Getting Started
```powershell
npm install
npm run dev
```
Visit http://localhost:3000 to view the landing page.

## Scripts
- `npm run dev` – Start dev server
- `npm run build` – Production build
- `npm start` – Run built app
- `npm run lint` – ESLint

## Next Steps
- Implement /try page workflow
- Add feature sections & agent explanations
- Integrate backend services for data upload & scanning

## License
Internal hackathon prototype

frontend/nordic-privacy-ai/next-env.d.ts (vendored, deleted, 5 lines)

@@ -1,5 +0,0 @@
/// <reference types="next" />
/// <reference types="next/image-types/global" />

// NOTE: This file should not be edited
// see https://nextjs.org/docs/basic-features/typescript for more information.

@@ -1001,6 +1001,7 @@
       "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "bin": {
         "acorn": "bin/acorn"
       },
@@ -1441,6 +1442,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "baseline-browser-mapping": "^2.8.19",
         "caniuse-lite": "^1.0.30001751",
@@ -2103,6 +2105,7 @@
       "deprecated": "This version is no longer supported. Please see https://eslint.org/version-support for other options.",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@eslint-community/eslint-utils": "^4.2.0",
         "@eslint-community/regexpp": "^4.6.1",
@@ -2271,6 +2274,7 @@
       "integrity": "sha512-whOE1HFo/qJDyX4SnXzP4N6zOWn79WhnCUY/iDR0mPfQZO8wcYE4JClzI2oZrhBnnMUCBCHZhO6VQyoBU95mZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@rtsao/scc": "^1.1.0",
         "array-includes": "^3.1.9",
@@ -4423,6 +4427,7 @@
         }
       ],
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "nanoid": "^3.3.7",
         "picocolors": "^1.0.0",
@@ -4581,6 +4586,7 @@
       "resolved": "https://registry.npmjs.org/react/-/react-18.2.0.tgz",
       "integrity": "sha512-/3IjMdb2L9QbBdWiW5e3P2/npwMBaU9mHCSCUzNln0ZCYbcfTsGbTJrU/kGemdH2IWmB2ioZ+zkxtmq6g09fGQ==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0"
       },
@@ -4593,6 +4599,7 @@
       "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.2.0.tgz",
       "integrity": "sha512-6IMTriUmvsjHUjNtEDudZfuDQUoWXVxKHhlEGSk81n4YFS+r/Kl99wXiwlVXtPBtJenozv2P+hxDsw9eA7Xo6g==",
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "loose-envify": "^1.1.0",
         "scheduler": "^0.23.0"
@@ -5539,6 +5546,7 @@
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -5708,6 +5716,7 @@
       "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==",
       "dev": true,
       "license": "Apache-2.0",
+      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"

@@ -9,4 +9,15 @@ setuptools>=65.0.0
 presidio-analyzer>=2.2.0
 presidio-anonymizer>=2.2.0
 spacy>=3.7.0
 # Download spaCy model with: python -m spacy download en_core_web_sm
+
+# FastAPI Backend
+fastapi>=0.109.0
+uvicorn[standard]>=0.27.0
+python-multipart>=0.0.6
+
+# Optional: GPU Support (uncomment if you have CUDA)
+# torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
+
+# Chatbot (WIP - not exposed in API yet)
+gpt4all>=2.0.0
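A minimal sketch, not part of this commit, of how the manual step in the comment above ("python -m spacy download en_core_web_sm") could be checked programmatically before the Presidio-based analysis runs; the helper name ensure_spacy_model is hypothetical:

def ensure_spacy_model(name="en_core_web_sm"):
    """Load the spaCy model the Presidio analyzers rely on, downloading it if missing."""
    import spacy
    try:
        return spacy.load(name)
    except OSError:
        # Equivalent to running: python -m spacy download en_core_web_sm
        from spacy.cli import download
        download(name)
        return spacy.load(name)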
145 src/__init__.py
@@ -1,145 +0,0 @@
"""
|
|
||||||
AI Governance Module - Bias Detection and Risk Analysis
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .data_processor import DataProcessor
|
|
||||||
from .model_trainer import GeneralizedModelTrainer
|
|
||||||
from .bias_analyzer import BiasAnalyzer
|
|
||||||
from .risk_analyzer import RiskAnalyzer
|
|
||||||
from .report_generator import ReportGenerator, NumpyEncoder
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import json
|
|
||||||
|
|
||||||
__version__ = '1.0.0'
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
'DataProcessor',
|
|
||||||
'GeneralizedModelTrainer',
|
|
||||||
'BiasAnalyzer',
|
|
||||||
'RiskAnalyzer',
|
|
||||||
'ReportGenerator',
|
|
||||||
'NumpyEncoder',
|
|
||||||
'AIGovernanceAnalyzer'
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class AIGovernanceAnalyzer:
|
|
||||||
"""
|
|
||||||
Main interface for AI Governance analysis
|
|
||||||
|
|
||||||
Example:
|
|
||||||
>>> analyzer = AIGovernanceAnalyzer()
|
|
||||||
>>> report = analyzer.analyze('data.csv', 'target', ['gender', 'age'])
|
|
||||||
>>> print(f"Bias Score: {report['summary']['overall_bias_score']:.3f}")
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
"""Initialize the analyzer"""
|
|
||||||
self.processor = None
|
|
||||||
self.trainer = None
|
|
||||||
self.bias_analyzer = None
|
|
||||||
self.risk_analyzer = None
|
|
||||||
self.report_generator = None
|
|
||||||
|
|
||||||
def analyze(self, data_path, target_column, protected_attributes):
|
|
||||||
"""
|
|
||||||
Run complete AI governance analysis from file
|
|
||||||
|
|
||||||
Args:
|
|
||||||
data_path (str): Path to CSV file
|
|
||||||
target_column (str): Name of target column
|
|
||||||
protected_attributes (list): List of protected attribute column names
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: Complete analysis report
|
|
||||||
"""
|
|
||||||
df = pd.read_csv(data_path)
|
|
||||||
return self.analyze_dataframe(df, target_column, protected_attributes)
|
|
||||||
|
|
||||||
def analyze_dataframe(self, df, target_column, protected_attributes):
|
|
||||||
"""
|
|
||||||
Run complete AI governance analysis from DataFrame
|
|
||||||
|
|
||||||
Args:
|
|
||||||
df (pd.DataFrame): Input dataframe
|
|
||||||
target_column (str): Name of target column
|
|
||||||
protected_attributes (list): List of protected attribute column names
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: Complete analysis report
|
|
||||||
"""
|
|
||||||
# Step 1: Process data
|
|
||||||
self.processor = DataProcessor(df)
|
|
||||||
self.processor.target_column = target_column
|
|
||||||
self.processor.protected_attributes = protected_attributes
|
|
||||||
self.processor.prepare_data()
|
|
||||||
|
|
||||||
# Step 2: Train model
|
|
||||||
self.trainer = GeneralizedModelTrainer(
|
|
||||||
self.processor.X_train,
|
|
||||||
self.processor.X_test,
|
|
||||||
self.processor.y_train,
|
|
||||||
self.processor.y_test,
|
|
||||||
self.processor.feature_names
|
|
||||||
)
|
|
||||||
self.trainer.train()
|
|
||||||
self.trainer.evaluate()
|
|
||||||
|
|
||||||
# Step 3: Analyze bias
|
|
||||||
self.bias_analyzer = BiasAnalyzer(
|
|
||||||
self.processor.X_test,
|
|
||||||
self.processor.y_test,
|
|
||||||
self.trainer.y_pred,
|
|
||||||
self.processor.df,
|
|
||||||
self.processor.protected_attributes,
|
|
||||||
self.processor.target_column
|
|
||||||
)
|
|
||||||
bias_results = self.bias_analyzer.analyze()
|
|
||||||
|
|
||||||
# Step 4: Assess risks
|
|
||||||
self.risk_analyzer = RiskAnalyzer(
|
|
||||||
self.processor.df,
|
|
||||||
self.trainer.results,
|
|
||||||
bias_results,
|
|
||||||
self.processor.protected_attributes,
|
|
||||||
self.processor.target_column
|
|
||||||
)
|
|
||||||
risk_results = self.risk_analyzer.analyze()
|
|
||||||
|
|
||||||
# Step 5: Generate report
|
|
||||||
self.report_generator = ReportGenerator(
|
|
||||||
self.trainer.results,
|
|
||||||
bias_results,
|
|
||||||
risk_results,
|
|
||||||
self.processor.df
|
|
||||||
)
|
|
||||||
|
|
||||||
return self.report_generator.generate_report()
|
|
||||||
|
|
||||||
def save_report(self, report, output_path):
|
|
||||||
"""
|
|
||||||
Save report to JSON file
|
|
||||||
|
|
||||||
Args:
|
|
||||||
report (dict): Analysis report
|
|
||||||
output_path (str): Path to save JSON file
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: Path to saved file
|
|
||||||
"""
|
|
||||||
with open(output_path, 'w') as f:
|
|
||||||
json.dump(report, f, indent=2, cls=NumpyEncoder)
|
|
||||||
return output_path
|
|
||||||
|
|
||||||
def get_summary(self, report):
|
|
||||||
"""
|
|
||||||
Get executive summary from report
|
|
||||||
|
|
||||||
Args:
|
|
||||||
report (dict): Analysis report
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: Summary metrics
|
|
||||||
"""
|
|
||||||
return report.get('summary', {})
|
|
||||||
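For reference, the removed module's own docstring implies the end-to-end usage below; this is a sketch of the pre-cleanup API only, with 'data.csv', 'target', ['gender', 'age'] taken from that docstring and the output path chosen as a placeholder:

from src import AIGovernanceAnalyzer

analyzer = AIGovernanceAnalyzer()
# Runs data processing, model training, bias analysis, risk analysis and report generation
report = analyzer.analyze('data.csv', 'target', ['gender', 'age'])
print(f"Bias Score: {report['summary']['overall_bias_score']:.3f}")
print(analyzer.get_summary(report))
# Persist the full report using the numpy-safe JSON encoder
analyzer.save_report(report, 'governance_report.json')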
263 src/report_generator.py
@@ -1,263 +0,0 @@
"""
|
|
||||||
Report Generator Module
|
|
||||||
Generates comprehensive JSON reports
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import numpy as np
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
class NumpyEncoder(json.JSONEncoder):
|
|
||||||
"""Custom JSON encoder for numpy types"""
|
|
||||||
def default(self, obj):
|
|
||||||
if isinstance(obj, (np.integer, np.int64, np.int32)):
|
|
||||||
return int(obj)
|
|
||||||
elif isinstance(obj, (np.floating, np.float64, np.float32)):
|
|
||||||
return float(obj)
|
|
||||||
elif isinstance(obj, (np.ndarray,)):
|
|
||||||
return obj.tolist()
|
|
||||||
elif isinstance(obj, (np.bool_,)):
|
|
||||||
return bool(obj)
|
|
||||||
return super(NumpyEncoder, self).default(obj)
|
|
||||||
|
|
||||||
class ReportGenerator:
|
|
||||||
"""Generate comprehensive analysis reports"""
|
|
||||||
|
|
||||||
def __init__(self, model_results, bias_results, risk_results, df):
|
|
||||||
self.model_results = model_results
|
|
||||||
self.bias_results = bias_results
|
|
||||||
self.risk_results = risk_results
|
|
||||||
self.df = df
|
|
||||||
|
|
||||||
def generate_report(self):
|
|
||||||
"""Generate comprehensive JSON report"""
|
|
||||||
report = {
|
|
||||||
'metadata': self._generate_metadata(),
|
|
||||||
'summary': self._generate_summary(),
|
|
||||||
'model_performance': self._format_model_results(),
|
|
||||||
'bias_analysis': self._format_bias_results(),
|
|
||||||
'risk_assessment': self._format_risk_results(),
|
|
||||||
'key_findings': self._extract_key_findings(),
|
|
||||||
'recommendations': self._compile_recommendations(),
|
|
||||||
'detailed_metrics': self._compile_detailed_metrics()
|
|
||||||
}
|
|
||||||
|
|
||||||
return report
|
|
||||||
|
|
||||||
def _generate_metadata(self):
|
|
||||||
"""Generate report metadata"""
|
|
||||||
return {
|
|
||||||
'report_id': f"AIGov_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
|
||||||
'generated_at': datetime.now().isoformat(),
|
|
||||||
'report_version': '1.0',
|
|
||||||
'dataset_info': {
|
|
||||||
'total_records': len(self.df),
|
|
||||||
'total_features': len(self.df.columns),
|
|
||||||
'columns': list(self.df.columns)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def _generate_summary(self):
|
|
||||||
"""Generate executive summary"""
|
|
||||||
model_metrics = self.model_results.get('metrics', {})
|
|
||||||
|
|
||||||
return {
|
|
||||||
'overall_bias_score': self.bias_results.get('overall_bias_score', 0.0),
|
|
||||||
'overall_risk_score': self.risk_results.get('overall_risk_score', 0.0),
|
|
||||||
'risk_level': self.risk_results.get('risk_level', 'UNKNOWN'),
|
|
||||||
'model_accuracy': model_metrics.get('accuracy', 0.0),
|
|
||||||
'fairness_violations_count': len(self.bias_results.get('fairness_violations', [])),
|
|
||||||
'passes_fairness_threshold': self.bias_results.get('fairness_assessment', {}).get('passes_fairness_threshold', False)
|
|
||||||
}
|
|
||||||
|
|
||||||
def _format_model_results(self):
|
|
||||||
"""Format model performance results"""
|
|
||||||
return {
|
|
||||||
'model_type': self.model_results.get('model_type', 'Unknown'),
|
|
||||||
'metrics': self.model_results.get('metrics', {}),
|
|
||||||
'confusion_matrix': self.model_results.get('confusion_matrix', []),
|
|
||||||
'top_features': dict(list(self.model_results.get('feature_importance', {}).items())[:10])
|
|
||||||
}
|
|
||||||
|
|
||||||
def _format_bias_results(self):
|
|
||||||
"""Format bias analysis results"""
|
|
||||||
return {
|
|
||||||
'overall_bias_score': self.bias_results.get('overall_bias_score', 0.0),
|
|
||||||
'fairness_metrics': self.bias_results.get('fairness_metrics', {}),
|
|
||||||
'fairness_violations': self.bias_results.get('fairness_violations', []),
|
|
||||||
'fairness_assessment': self.bias_results.get('fairness_assessment', {}),
|
|
||||||
'demographic_bias_summary': self._summarize_demographic_bias()
|
|
||||||
}
|
|
||||||
|
|
||||||
def _format_risk_results(self):
|
|
||||||
"""Format risk assessment results"""
|
|
||||||
return {
|
|
||||||
'overall_risk_score': self.risk_results.get('overall_risk_score', 0.0),
|
|
||||||
'risk_level': self.risk_results.get('risk_level', 'UNKNOWN'),
|
|
||||||
'risk_categories': self.risk_results.get('risk_categories', {}),
|
|
||||||
'privacy_risks': self._summarize_privacy_risks(),
|
|
||||||
'ethical_risks': self._summarize_ethical_risks()
|
|
||||||
}
|
|
||||||
|
|
||||||
def _summarize_demographic_bias(self):
|
|
||||||
"""Summarize demographic bias"""
|
|
||||||
demo_bias = self.bias_results.get('demographic_bias', {})
|
|
||||||
summary = {}
|
|
||||||
|
|
||||||
for attr, data in demo_bias.items():
|
|
||||||
summary[attr] = {
|
|
||||||
'max_disparity': data.get('max_disparity', 0),
|
|
||||||
'groups_analyzed': len(data.get('approval_rates', {}))
|
|
||||||
}
|
|
||||||
|
|
||||||
return summary
|
|
||||||
|
|
||||||
def _summarize_privacy_risks(self):
|
|
||||||
"""Summarize privacy risks"""
|
|
||||||
privacy = self.risk_results.get('privacy_risks', {})
|
|
||||||
|
|
||||||
return {
|
|
||||||
'pii_count': len(privacy.get('pii_detected', [])),
|
|
||||||
'anonymization_level': privacy.get('anonymization_level', 'UNKNOWN'),
|
|
||||||
'exposure_risk_count': len(privacy.get('exposure_risks', [])),
|
|
||||||
'gdpr_compliance_score': privacy.get('gdpr_compliance', {}).get('compliance_score', 0)
|
|
||||||
}
|
|
||||||
|
|
||||||
def _summarize_ethical_risks(self):
|
|
||||||
"""Summarize ethical risks"""
|
|
||||||
ethical = self.risk_results.get('ethical_risks', {})
|
|
||||||
|
|
||||||
return {
|
|
||||||
'fairness_issues_count': len(ethical.get('fairness_issues', [])),
|
|
||||||
'transparency_score': ethical.get('transparency_score', 0),
|
|
||||||
'bias_amplification_risk': ethical.get('bias_amplification_risk', 'UNKNOWN'),
|
|
||||||
'social_impact': ethical.get('social_impact_assessment', {})
|
|
||||||
}
|
|
||||||
|
|
||||||
def _extract_key_findings(self):
|
|
||||||
"""Extract key findings from analysis"""
|
|
||||||
findings = []
|
|
||||||
|
|
||||||
# Model performance findings
|
|
||||||
accuracy = self.model_results.get('metrics', {}).get('accuracy', 0)
|
|
||||||
if accuracy >= 0.8:
|
|
||||||
findings.append(f"✓ Model achieves good accuracy ({accuracy:.2%})")
|
|
||||||
else:
|
|
||||||
findings.append(f"⚠ Model accuracy is below optimal ({accuracy:.2%})")
|
|
||||||
|
|
||||||
# Bias findings
|
|
||||||
bias_score = self.bias_results.get('overall_bias_score', 0)
|
|
||||||
if bias_score < 0.3:
|
|
||||||
findings.append("✓ Low bias detected across protected attributes")
|
|
||||||
elif bias_score < 0.5:
|
|
||||||
findings.append("⚠ Moderate bias detected - monitoring recommended")
|
|
||||||
else:
|
|
||||||
findings.append("❌ High bias detected - immediate action required")
|
|
||||||
|
|
||||||
# Fairness violations
|
|
||||||
violations = self.bias_results.get('fairness_violations', [])
|
|
||||||
if violations:
|
|
||||||
high_sev = sum(1 for v in violations if v['severity'] == 'HIGH')
|
|
||||||
findings.append(f"❌ {len(violations)} fairness violations detected ({high_sev} high severity)")
|
|
||||||
else:
|
|
||||||
findings.append("✓ No fairness violations detected")
|
|
||||||
|
|
||||||
# Privacy findings
|
|
||||||
privacy = self.risk_results.get('privacy_risks', {})
|
|
||||||
pii_count = len(privacy.get('pii_detected', []))
|
|
||||||
if pii_count > 0:
|
|
||||||
findings.append(f"⚠ {pii_count} columns contain potential PII")
|
|
||||||
else:
|
|
||||||
findings.append("✓ No obvious PII detected in dataset")
|
|
||||||
|
|
||||||
# Risk level
|
|
||||||
risk_level = self.risk_results.get('risk_level', 'UNKNOWN')
|
|
||||||
findings.append(f"Overall Risk Level: {risk_level}")
|
|
||||||
|
|
||||||
return findings
|
|
||||||
|
|
||||||
def _compile_recommendations(self):
|
|
||||||
"""Compile all recommendations"""
|
|
||||||
recommendations = []
|
|
||||||
|
|
||||||
# Get recommendations from each component
|
|
||||||
privacy_recs = self.risk_results.get('privacy_risks', {}).get('recommendations', [])
|
|
||||||
ethical_recs = self.risk_results.get('ethical_risks', {}).get('recommendations', [])
|
|
||||||
performance_recs = self.risk_results.get('model_performance_risks', {}).get('recommendations', [])
|
|
||||||
compliance_recs = self.risk_results.get('compliance_risks', {}).get('recommendations', [])
|
|
||||||
|
|
||||||
# Prioritize recommendations
|
|
||||||
all_recs = []
|
|
||||||
|
|
||||||
# High priority (from violations and high risks)
|
|
||||||
violations = self.bias_results.get('fairness_violations', [])
|
|
||||||
if violations:
|
|
||||||
all_recs.append({
|
|
||||||
'priority': 'HIGH',
|
|
||||||
'category': 'Fairness',
|
|
||||||
'recommendation': 'Address fairness violations in protected attributes'
|
|
||||||
})
|
|
||||||
|
|
||||||
if len(privacy_recs) > 0:
|
|
||||||
all_recs.append({
|
|
||||||
'priority': 'HIGH',
|
|
||||||
'category': 'Privacy',
|
|
||||||
'recommendation': privacy_recs[0]
|
|
||||||
})
|
|
||||||
|
|
||||||
# Medium priority
|
|
||||||
for rec in ethical_recs[:2]:
|
|
||||||
all_recs.append({
|
|
||||||
'priority': 'MEDIUM',
|
|
||||||
'category': 'Ethics',
|
|
||||||
'recommendation': rec
|
|
||||||
})
|
|
||||||
|
|
||||||
# Lower priority
|
|
||||||
for rec in performance_recs[:2]:
|
|
||||||
all_recs.append({
|
|
||||||
'priority': 'MEDIUM',
|
|
||||||
'category': 'Performance',
|
|
||||||
'recommendation': rec
|
|
||||||
})
|
|
||||||
|
|
||||||
for rec in compliance_recs[:2]:
|
|
||||||
all_recs.append({
|
|
||||||
'priority': 'MEDIUM',
|
|
||||||
'category': 'Compliance',
|
|
||||||
'recommendation': rec
|
|
||||||
})
|
|
||||||
|
|
||||||
# Convert to simple list with formatting
|
|
||||||
recommendations = [
|
|
||||||
f"[{r['priority']}] {r['category']}: {r['recommendation']}"
|
|
||||||
for r in all_recs[:10] # Limit to top 10
|
|
||||||
]
|
|
||||||
|
|
||||||
return recommendations
|
|
||||||
|
|
||||||
def _compile_detailed_metrics(self):
|
|
||||||
"""Compile detailed metrics for analysis"""
|
|
||||||
return {
|
|
||||||
'bias_metrics': {
|
|
||||||
'by_attribute': self.bias_results.get('fairness_metrics', {}),
|
|
||||||
'demographic_analysis': self.bias_results.get('demographic_bias', {})
|
|
||||||
},
|
|
||||||
'risk_breakdown': {
|
|
||||||
'privacy': self.risk_results.get('privacy_risks', {}),
|
|
||||||
'ethical': self.risk_results.get('ethical_risks', {}),
|
|
||||||
'compliance': self.risk_results.get('compliance_risks', {}),
|
|
||||||
'data_quality': self.risk_results.get('data_quality_risks', {})
|
|
||||||
},
|
|
||||||
'model_details': {
|
|
||||||
'classification_report': self.model_results.get('classification_report', {}),
|
|
||||||
'feature_importance': self.model_results.get('feature_importance', {})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
def save_report(self, filepath):
|
|
||||||
"""Save report to JSON file"""
|
|
||||||
report = self.generate_report()
|
|
||||||
with open(filepath, 'w') as f:
|
|
||||||
json.dump(report, f, indent=2, cls=NumpyEncoder)
|
|
||||||
return filepath
|
|
||||||
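The removed NumpyEncoder existed because json.dump cannot serialize numpy scalars or arrays on its own. A small illustration of the behaviour it provided, assuming the class definition above is importable; the metric values are made up:

import json
import numpy as np

metrics = {
    "accuracy": np.float64(0.87),
    "confusion_matrix": np.array([[50, 5], [7, 38]]),
}
# json.dumps(metrics) alone raises TypeError; the custom encoder first converts
# numpy ints/floats to Python numbers, ndarrays to lists, and np.bool_ to bool.
print(json.dumps(metrics, indent=2, cls=NumpyEncoder))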
21 start_api.py Normal file
@@ -0,0 +1,21 @@
+"""
+Start the FastAPI server
+Run: python start_api.py
+"""
+
+import uvicorn
+
+if __name__ == "__main__":
+    print("🚀 Starting Nordic Privacy AI API Server...")
+    print("📍 API will be available at: http://localhost:8000")
+    print("📖 Interactive docs at: http://localhost:8000/docs")
+    print("🔗 Frontend should run at: http://localhost:3000")
+    print("\nPress CTRL+C to stop\n")
+
+    uvicorn.run(
+        "api.main:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=True,  # Auto-reload on code changes
+        log_level="info"
+    )
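Once start_api.py is running, a quick smoke test from Python; this sketch uses only the base URL printed by the script plus FastAPI's default /openapi.json schema route, and assumes the requests package is available (it is not listed in the requirements above):

import requests

resp = requests.get("http://localhost:8000/openapi.json", timeout=5)
resp.raise_for_status()
schema = resp.json()
# List the routes the backend actually exposes without guessing their names
print(sorted(schema.get("paths", {}).keys()))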
@@ -10,7 +10,7 @@ import os
 # Add parent directory to path
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
-from cleaning import DataCleaner, CleaningConfig
+from data_cleaning import DataCleaner
 
 
 def test_basic_cleaning():