diff --git a/.gitignore b/.gitignore
index 44cf1b7..7f76539 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,7 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
+
 lib64/
 parts/
 sdist/
@@ -48,9 +48,11 @@ Thumbs.db
 # Streamlit
 .streamlit/secrets.toml
 
-# Reports
+# Reports and generated files
 reports/*.json
 reports/*.pdf
+reports/*.csv
+reports/*.html
 
 # Logs
 *.log
@@ -59,4 +61,17 @@ reports/*.pdf
 *.csv
 !Datasets/loan_data.csv
 
+# Node.js & Next.js
+node_modules/
+.next/
+out/
+.vercel
+*.tsbuildinfo
+next-env.d.ts
+
+# Frontend build artifacts
+frontend/nordic-privacy-ai/.next/
+frontend/nordic-privacy-ai/out/
+frontend/nordic-privacy-ai/node_modules/
+
 Data
\ No newline at end of file
diff --git a/README.md b/README.md
index 8b4af0a..701a09c 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,1029 @@
-# AI Governance Module
-
-A Python package for detecting bias and analyzing risks in machine learning models. Provides comprehensive fairness metrics, privacy risk assessment, and ethical AI evaluation.
-
-## Features
-
-### 🎯 Bias Detection
-- **Fairness Metrics**: Disparate Impact, Statistical Parity Difference, Equal Opportunity Difference
-- **Demographic Analysis**: Group-wise performance evaluation
-- **Violation Detection**: Automatic flagging with severity levels
-
-### πŸ›‘οΈ Risk Assessment
-- **Privacy Risks**: PII detection, GDPR compliance, data exposure analysis
-- **Ethical Risks**: Fairness, transparency, accountability, social impact
-- **Compliance Risks**: Regulatory adherence (GDPR, CCPA, AI Act)
-- **Data Quality**: Missing data, class imbalance, outlier detection
-
-### πŸ€– Machine Learning
-- Generalized classification model (works with any dataset)
-- Auto-detection of feature types and protected attributes
-- Comprehensive performance metrics
-- Feature importance analysis
-
-## Installation
-
-```bash
-pip install -r requirements.txt
-```
-
-Or install as a package:
-
-```bash
-pip install -e .
-```
-
-## Quick Start
-
+# Nordic Privacy AI πŸ›‘οΈ
+
+**AI-Powered GDPR Compliance & Privacy Protection Platform**
+
+A comprehensive solution for AI governance, bias detection, risk assessment, and automated PII cleaning with GDPR compliance. Built for Nordic ecosystems and beyond.
+
+[![Python](https://img.shields.io/badge/Python-3.8+-blue.svg)](https://www.python.org/)
+[![FastAPI](https://img.shields.io/badge/FastAPI-0.109+-green.svg)](https://fastapi.tiangolo.com/)
+[![Next.js](https://img.shields.io/badge/Next.js-14.2+-black.svg)](https://nextjs.org/)
+[![License](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+
+---
+
+## πŸš€ Quick Start
+
+### Prerequisites
+- Python 3.8+
+- Node.js 18+
+- GPU (optional, for faster processing)
+
+### Installation
+
+1. **Clone the repository**
+```powershell
+git clone https://github.com/PlatypusPus/MushroomEmpire.git
+cd MushroomEmpire
+```
+
+2. **Install Python dependencies**
+```powershell
+pip install -r requirements.txt
+python -m spacy download en_core_web_sm
+```
+
+3. **Install frontend dependencies**
+```powershell
+cd frontend
+npm install
+cd ..
+```
+
+### Running the Application
+
+1. **Start the FastAPI backend** (Terminal 1)
+```powershell
+python start_api.py
+```
+Backend runs at: **http://localhost:8000**
+
+2. **Start the Next.js frontend** (Terminal 2)
+```powershell
+cd frontend
+npm run dev
+```
+Frontend runs at: **http://localhost:3000**
+
+3. **Access the application**
+   - Frontend UI: http://localhost:3000
+   - Try It Page: http://localhost:3000/try
+   - API Documentation: http://localhost:8000/docs
+   - Health Check: http://localhost:8000/health
+
+---
+
+## πŸ“‹ Features
+
+### 🎯 AI Governance & Bias Detection
+- **Fairness Metrics**: Disparate Impact, Statistical Parity, Equal Opportunity
+- **Demographic Analysis**: Group-wise performance evaluation
+- **Violation Detection**: Automatic flagging with severity levels (HIGH/MEDIUM/LOW)
+- **Model Performance**: Comprehensive ML metrics (accuracy, precision, recall, F1)
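+
+The fairness metrics above reduce to simple group-level positive rates, so they are easy to sanity-check by hand. The snippet below is a minimal illustration only - it is not the `BiasAnalyzer` implementation, which lives in `ai_governance/bias_analyzer.py` - and it assumes a binary 0/1 outcome column and illustrative column names:
+
+```python
+import pandas as pd
+
+def group_positive_rates(df: pd.DataFrame, group_col: str, outcome_col: str) -> pd.Series:
+    """Share of positive (1) outcomes per group; assumes a binary 0/1 outcome column."""
+    return df.groupby(group_col)[outcome_col].mean()
+
+def disparate_impact(rates: pd.Series) -> float:
+    """Least-favoured rate divided by most-favoured rate (fair range roughly 0.8 - 1.25)."""
+    return float(rates.min() / rates.max())
+
+def statistical_parity_difference(rates: pd.Series) -> float:
+    """Largest gap between group positive rates (flagged when above ~0.1)."""
+    return float(rates.max() - rates.min())
+
+# 'gender' and 'loan_approved' are illustrative column names - adjust to your dataset.
+df = pd.read_csv("Datasets/loan_data.csv")
+rates = group_positive_rates(df, "gender", "loan_approved")
+print(f"Disparate impact: {disparate_impact(rates):.2f}")
+print(f"Statistical parity difference: {statistical_parity_difference(rates):.2f}")
+```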
+
+### πŸ›‘οΈ Privacy Risk Assessment
+- **Privacy Risks**: PII detection, GDPR compliance scoring, data exposure analysis
+- **Ethical Risks**: Fairness, transparency, accountability evaluation
+- **Compliance Risks**: Regulatory adherence (GDPR, CCPA, AI Act)
+- **Data Quality**: Missing data, class imbalance, outlier detection
+
+### 🧹 Automated Data Cleaning
+- **PII Detection**: Email, phone, SSN, credit cards, IP addresses, and more
+- **GPU Acceleration**: CUDA-enabled for 10x faster processing
+- **GDPR Compliance**: Automatic anonymization with audit trails
+- **Smart Anonymization**: Context-aware masking and pseudonymization
+
+### 🌐 Modern Web Interface
+- **Drag & Drop Upload**: Intuitive CSV file handling
+- **Real-time Processing**: Live feedback and progress tracking
+- **Interactive Dashboards**: Visualize bias metrics, risk scores, and results
+- **Report Downloads**: JSON reports, cleaned CSV, and audit logs
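+
+PII detection is pattern-driven: the patterns and GDPR rules used in production are defined in `data_cleaning/config.py`. The sketch below only illustrates the general regex approach - the patterns are simplified stand-ins, not the module's actual configuration or API:
+
+```python
+import re
+
+# Illustrative patterns only - the real ones live in data_cleaning/config.py.
+PATTERNS = {
+    "EMAIL": re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+"),
+    "PHONE": re.compile(r"\+?\d[\d\s().-]{7,}\d"),
+    "IP_ADDRESS": re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
+}
+
+def detect_pii(text: str) -> dict:
+    """Return {entity_type: [matches]} for every pattern found in a cell value."""
+    return {name: rx.findall(text) for name, rx in PATTERNS.items() if rx.search(text)}
+
+print(detect_pii("Contact anna@example.com or +47 912 34 567 from 192.168.0.1"))
+```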
+
+---
+
+## πŸ—οΈ Project Structure
+
+```
+MushroomEmpire/
+β”œβ”€β”€ api/                         # FastAPI Backend
+β”‚   β”œβ”€β”€ main.py                  # Application entry point
+β”‚   β”œβ”€β”€ routers/
+β”‚   β”‚   β”œβ”€β”€ analyze.py           # POST /api/analyze - AI Governance
+β”‚   β”‚   └── clean.py             # POST /api/clean - Data Cleaning
+β”‚   └── utils/                   # Helper utilities
+β”‚
+β”œβ”€β”€ ai_governance/               # Core AI Governance Module
+β”‚   β”œβ”€β”€ __init__.py              # AIGovernanceAnalyzer class
+β”‚   β”œβ”€β”€ data_processor.py        # Data preprocessing
+β”‚   β”œβ”€β”€ model_trainer.py         # ML model training
+β”‚   β”œβ”€β”€ bias_analyzer.py         # Bias detection engine
+β”‚   β”œβ”€β”€ risk_analyzer.py         # Risk assessment engine
+β”‚   └── report_generator.py      # JSON report generation
+β”‚
+β”œβ”€β”€ data_cleaning/               # Data Cleaning Module
+β”‚   β”œβ”€β”€ __init__.py              # DataCleaner class
+β”‚   β”œβ”€β”€ cleaner.py               # PII detection & anonymization
+β”‚   └── config.py                # PII patterns & GDPR rules
+β”‚
+β”œβ”€β”€ frontend/                    # Next.js Frontend
+β”‚   β”œβ”€β”€ app/                     # App Router pages
+β”‚   β”‚   β”œβ”€β”€ page.tsx             # Landing page
+β”‚   β”‚   └── try/page.tsx         # Try it page (workflow UI)
+β”‚   β”œβ”€β”€ components/
+β”‚   β”‚   └── try/
+β”‚   β”‚       β”œβ”€β”€ CenterPanel.tsx  # File upload & results
+β”‚   β”‚       β”œβ”€β”€ Sidebar.tsx      # Workflow tabs
+β”‚   β”‚       └── ChatbotPanel.tsx # AI assistant
+β”‚   └── lib/
+β”‚       β”œβ”€β”€ api.ts               # TypeScript API client
+β”‚       └── indexeddb.ts         # Browser caching utilities
+β”‚
+β”œβ”€β”€ Datasets/                    # Sample datasets
+β”‚   └── loan_data.csv            # Example: Loan approval dataset
+β”‚
+β”œβ”€β”€ reports/                     # Generated reports (auto-created)
+β”‚   β”œβ”€β”€ governance_report_*.json
+β”‚   β”œβ”€β”€ cleaned_*.csv
+β”‚   └── cleaning_audit_*.json
+β”‚
+β”œβ”€β”€ test_cleaning.py             # Unit tests for cleaning module
+β”œβ”€β”€ start_api.py                 # Backend startup script
+β”œβ”€β”€ setup.py                     # Package configuration
+β”œβ”€β”€ requirements.txt             # Python dependencies
+└── README.md                    # This file
+```
+
+---
+
+## πŸ“‘ API Reference
+
+### Base URL
+```
+http://localhost:8000
+```
+
+### Endpoints
+
+#### **POST /api/analyze**
+Analyze dataset for bias, fairness, and risk assessment.
+
+**Request:**
+```bash
+curl -X POST "http://localhost:8000/api/analyze" \
+  -F "file=@Datasets/loan_data.csv"
+```
+
+**Response:**
+```json
+{
+  "status": "success",
+  "filename": "loan_data.csv",
+  "dataset_info": {
+    "rows": 1000,
+    "columns": 15
+  },
+  "model_performance": {
+    "accuracy": 0.85,
+    "precision": 0.82,
+    "recall": 0.88,
+    "f1_score": 0.85
+  },
+  "bias_metrics": {
+    "overall_bias_score": 0.23,
+    "violations_detected": []
+  },
+  "risk_assessment": {
+    "overall_risk_score": 0.35,
+    "privacy_risks": [],
+    "ethical_risks": []
+  },
+  "recommendations": [
+    "[HIGH] Privacy: Remove PII columns before deployment",
+    "[MEDIUM] Fairness: Monitor demographic parity over time"
+  ],
+  "report_file": "/reports/governance_report_20251107_123456.json"
+}
+```
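+
+The same endpoint can be exercised from Python instead of curl. This is a minimal sketch that assumes the backend started by `start_api.py` is running locally and that the `requests` package is installed; the response fields mirror the example above:
+
+```python
+import requests
+
+with open("Datasets/loan_data.csv", "rb") as f:
+    resp = requests.post(
+        "http://localhost:8000/api/analyze",
+        files={"file": ("loan_data.csv", f, "text/csv")},
+        timeout=300,  # model training can take a while on larger datasets
+    )
+resp.raise_for_status()
+result = resp.json()
+print("Bias score:", result["bias_metrics"]["overall_bias_score"])
+print("Report:", "http://localhost:8000" + result["report_file"])
+```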
+
+#### **POST /api/clean**
+Detect and anonymize PII in datasets.
+
+**Request:**
+```bash
+curl -X POST "http://localhost:8000/api/clean" \
+  -F "file=@Datasets/loan_data.csv"
+```
+
+**Response:**
+```json
+{
+  "status": "success",
+  "dataset_info": {
+    "original_rows": 1000,
+    "original_columns": 15,
+    "cleaned_rows": 1000,
+    "cleaned_columns": 13
+  },
+  "summary": {
+    "columns_removed": ["ssn", "email"],
+    "columns_anonymized": ["phone", "address"],
+    "total_cells_affected": 2847
+  },
+  "pii_detections": {
+    "EMAIL": 1000,
+    "PHONE": 987,
+    "SSN": 1000
+  },
+  "gdpr_compliance": [
+    "Article 5(1)(c) - Data minimization",
+    "Article 17 - Right to erasure",
+    "Article 25 - Data protection by design"
+  ],
+  "files": {
+    "cleaned_csv": "/reports/cleaned_20251107_123456.csv",
+    "audit_report": "/reports/cleaning_audit_20251107_123456.json"
+  }
+}
+```
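+
+The cleaned CSV and audit report are written under `reports/` and exposed through the static `/reports` mount, so a Python client can fetch them straight from the paths in the response. Again a sketch that assumes a locally running backend and the `requests` package:
+
+```python
+import requests
+
+BASE = "http://localhost:8000"
+
+with open("Datasets/loan_data.csv", "rb") as f:
+    resp = requests.post(f"{BASE}/api/clean", files={"file": f}, timeout=300)
+resp.raise_for_status()
+result = resp.json()
+
+# Download the anonymized CSV that the API wrote to the reports/ directory.
+cleaned_url = BASE + result["files"]["cleaned_csv"]
+with open("cleaned_output.csv", "wb") as out:
+    out.write(requests.get(cleaned_url, timeout=60).content)
+print("Cells affected:", result["summary"]["total_cells_affected"])
+```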
+
+#### **GET /health**
+Health check endpoint with GPU status.
+
+**Response:**
+```json
+{
+  "status": "healthy",
+  "gpu_acceleration": {
+    "available": true,
+    "device": "<CUDA device name>"
+  }
+}
+```
+
+#### **GET /reports/{filename}**
+Download generated reports and cleaned files.
+
+---
+
+## πŸ”§ Configuration
+
+### Environment Variables
+
+Create a `.env` file in `frontend/`:
+```env
+NEXT_PUBLIC_API_URL=http://localhost:8000
+```
+
+### CORS Configuration
+
+Edit `api/main.py` to add production domains:
+```python
+origins = [
+    "http://localhost:3000",
+    "https://your-production-domain.com"
+]
+```
+
+### GPU Acceleration
+
+GPU is automatically detected and used if available.
To force CPU mode: +```python +# In cleaning.py or api endpoints +DataCleaner(use_gpu=False) +``` + +--- + +## πŸ§ͺ Testing + +### Test the Backend +```powershell +# Test analyze endpoint +curl -X POST "http://localhost:8000/api/analyze" -F "file=@Datasets/loan_data.csv" + +# Test clean endpoint +curl -X POST "http://localhost:8000/api/clean" -F "file=@Datasets/loan_data.csv" + +# Check health +curl http://localhost:8000/health +``` + +### Run Unit Tests +```powershell +# Test cleaning module +python test_cleaning.py + +# Run all tests (if pytest configured) +pytest +``` + +--- + +## πŸ“Š Usage Examples + +### Python SDK Usage ```python from ai_governance import AIGovernanceAnalyzer @@ -41,221 +1031,143 @@ from ai_governance import AIGovernanceAnalyzer # Initialize analyzer analyzer = AIGovernanceAnalyzer() -# Run complete analysis +# Analyze dataset report = analyzer.analyze( - data_path='your_data.csv', - target_column='target', + data_path='Datasets/loan_data.csv', + target_column='loan_approved', protected_attributes=['gender', 'age', 'race'] ) -# Access results +# Print results print(f"Bias Score: {report['summary']['overall_bias_score']:.3f}") print(f"Risk Level: {report['summary']['risk_level']}") print(f"Model Accuracy: {report['summary']['model_accuracy']:.3f}") # Save report -analyzer.save_report(report, 'governance_report.json') +analyzer.save_report(report, 'my_report.json') ``` -## Module Structure - -``` -ai_governance/ -β”œβ”€β”€ __init__.py # Main API -β”œβ”€β”€ data_processor.py # Data preprocessing -β”œβ”€β”€ model_trainer.py # ML model training -β”œβ”€β”€ bias_analyzer.py # Bias detection -β”œβ”€β”€ risk_analyzer.py # Risk assessment -└── report_generator.py # Report generation -``` - -## API Reference - -### AIGovernanceAnalyzer - -Main class for running AI governance analysis. 
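+
+When the data is already in memory (for example in a notebook or another service), `analyze_dataframe` - the same method the API router calls internally - can be used instead of a file path. A short sketch; the column names are illustrative:
+
+```python
+import pandas as pd
+from ai_governance import AIGovernanceAnalyzer
+
+analyzer = AIGovernanceAnalyzer()
+
+df = pd.read_csv("Datasets/loan_data.csv")
+report = analyzer.analyze_dataframe(
+    df=df,
+    target_column="loan_approved",
+    protected_attributes=["gender", "age"],
+)
+print(report["summary"]["risk_level"])
+```
+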
+### Data Cleaning Usage ```python -analyzer = AIGovernanceAnalyzer() +from cleaning import DataCleaner -# Analyze from DataFrame -report = analyzer.analyze_dataframe( - df=dataframe, - target_column='target', - protected_attributes=['gender', 'age'] -) +# Initialize cleaner with GPU +cleaner = DataCleaner(use_gpu=True) -# Analyze from file -report = analyzer.analyze( - data_path='data.csv', - target_column='target', - protected_attributes=['gender', 'age'] -) +# Load and clean data +df = cleaner.load_data('Datasets/loan_data.csv') +cleaned_df, audit = cleaner.anonymize_pii(df) + +# Save results +cleaner.save_cleaned_data(cleaned_df, 'cleaned_output.csv') +cleaner.save_audit_report(audit, 'audit_report.json') ``` -### Individual Components +### Frontend Integration -```python -from ai_governance import ( - DataProcessor, - GeneralizedModelTrainer, - BiasAnalyzer, - RiskAnalyzer, - ReportGenerator -) +```typescript +import { analyzeDataset, cleanDataset } from '@/lib/api'; -# Process data -processor = DataProcessor(df) -processor.target_column = 'target' -processor.protected_attributes = ['gender', 'age'] -processor.prepare_data() +// Analyze uploaded file +const handleAnalyze = async (file: File) => { + const result = await analyzeDataset(file); + console.log('Bias Score:', result.bias_metrics.overall_bias_score); + console.log('Download:', result.report_file); +}; -# Train model -trainer = GeneralizedModelTrainer( - processor.X_train, - processor.X_test, - processor.y_train, - processor.y_test, - processor.feature_names -) -trainer.train() -trainer.evaluate() - -# Analyze bias -bias_analyzer = BiasAnalyzer( - processor.X_test, - processor.y_test, - trainer.y_pred, - processor.df, - processor.protected_attributes, - processor.target_column -) -bias_results = bias_analyzer.analyze() - -# Assess risks -risk_analyzer = RiskAnalyzer( - processor.df, - trainer.results, - bias_results, - processor.protected_attributes, - processor.target_column -) -risk_results = risk_analyzer.analyze() - -# Generate report -report_gen = ReportGenerator( - trainer.results, - bias_results, - risk_results, - processor.df -) -report = report_gen.generate_report() +// Clean uploaded file +const handleClean = async (file: File) => { + const result = await cleanDataset(file); + console.log('Cells anonymized:', result.summary.total_cells_affected); + console.log('Download cleaned:', result.files.cleaned_csv); +}; ``` -## Report Structure +--- -The module generates comprehensive JSON reports: - -```json -{ - "metadata": { - "report_id": "unique_id", - "generated_at": "timestamp", - "dataset_info": {} - }, - "summary": { - "overall_bias_score": 0.0-1.0, - "overall_risk_score": 0.0-1.0, - "risk_level": "LOW|MEDIUM|HIGH", - "model_accuracy": 0.0-1.0, - "fairness_violations_count": 0 - }, - "model_performance": {}, - "bias_analysis": {}, - "risk_assessment": {}, - "key_findings": [], - "recommendations": [] -} -``` - -## Metrics Interpretation +## πŸ“ˆ Metrics Interpretation ### Bias Score (0-1, lower is better) -- **0.0 - 0.3**: Low bias βœ… -- **0.3 - 0.5**: Moderate bias ⚠️ -- **0.5 - 1.0**: High bias ❌ +- **0.0 - 0.3**: βœ… Low bias - Good fairness +- **0.3 - 0.5**: ⚠️ Moderate bias - Monitoring recommended +- **0.5 - 1.0**: ❌ High bias - Immediate action required ### Risk Score (0-1, lower is better) -- **0.0 - 0.4**: LOW risk βœ… -- **0.4 - 0.7**: MEDIUM risk ⚠️ -- **0.7 - 1.0**: HIGH risk ❌ +- **0.0 - 0.4**: βœ… LOW risk +- **0.4 - 0.7**: ⚠️ MEDIUM risk +- **0.7 - 1.0**: ❌ HIGH risk ### Fairness Metrics - **Disparate 
Impact**: Fair range 0.8 - 1.25 - **Statistical Parity**: Fair threshold < 0.1 - **Equal Opportunity**: Fair threshold < 0.1 -## Requirements +--- -- Python 3.8+ -- pandas >= 2.0.0 -- numpy >= 1.24.0 -- scikit-learn >= 1.3.0 +## πŸ› οΈ Technology Stack -See `requirements.txt` for complete list. +### Backend +- **FastAPI** - Modern Python web framework +- **scikit-learn** - Machine learning +- **spaCy** - NLP for PII detection +- **PyTorch** - GPU acceleration (optional) +- **pandas** - Data processing -## Integration Examples +### Frontend +- **Next.js 14** - React framework with App Router +- **TypeScript** - Type safety +- **Tailwind CSS** - Styling +- **IndexedDB** - Browser storage -### FastAPI Backend +--- -```python -from fastapi import FastAPI, UploadFile -from ai_governance import AIGovernanceAnalyzer +## 🀝 Contributing -app = FastAPI() -analyzer = AIGovernanceAnalyzer() +Contributions are welcome! Please follow these steps: -@app.post("/analyze") -async def analyze(file: UploadFile, target: str, protected: list): - df = pd.read_csv(file.file) - report = analyzer.analyze_dataframe(df, target, protected) - return report +1. Fork the repository +2. Create a feature branch (`git checkout -b feature/AmazingFeature`) +3. Commit your changes (`git commit -m 'Add some AmazingFeature'`) +4. Push to the branch (`git push origin feature/AmazingFeature`) +5. Open a Pull Request + +--- + +## πŸ“ License + +This project is licensed under the MIT License - see the LICENSE file for details. + +--- + +## πŸŽ“ Citation + +If you use this project in your research or work, please cite: + +```bibtex +@software{nordic_privacy_ai, + title = {Nordic Privacy AI - GDPR Compliance & AI Governance Platform}, + author = {PlatypusPus}, + year = {2025}, + url = {https://github.com/PlatypusPus/MushroomEmpire} +} ``` -### Flask Backend +--- -```python -from flask import Flask, request, jsonify -from ai_governance import AIGovernanceAnalyzer +## πŸ“§ Support -app = Flask(__name__) -analyzer = AIGovernanceAnalyzer() +- **Issues**: [GitHub Issues](https://github.com/PlatypusPus/MushroomEmpire/issues) +- **Discussions**: [GitHub Discussions](https://github.com/PlatypusPus/MushroomEmpire/discussions) -@app.route('/analyze', methods=['POST']) -def analyze(): - file = request.files['file'] - df = pd.read_csv(file) - report = analyzer.analyze_dataframe( - df, - request.form['target'], - request.form.getlist('protected') - ) - return jsonify(report) -``` +--- -## License +## πŸ™ Acknowledgments -MIT License +- Built for Nordic ecosystems (BankID, MitID, Suomi.fi) +- Inspired by GDPR, CCPA, and EU AI Act requirements +- Developed during a hackathon prototype -## Contributing +--- -Contributions welcome! Please open an issue or submit a pull request. 
- -## Citation - -If you use this module in your research or project, please cite: - -``` -AI Governance Module - Bias Detection and Risk Analysis -https://github.com/PlatypusPus/MushroomEmpire -``` +**Made with ❀️ by the Nordic Privacy AI Team** diff --git a/ai_governance/data_processor.py b/ai_governance/data_processor.py index 01c70bb..e7004df 100644 --- a/ai_governance/data_processor.py +++ b/ai_governance/data_processor.py @@ -21,6 +21,7 @@ class DataProcessor: self.categorical_features = [] self.feature_names = [] self.encoders = {} + self.target_encoder = None # Add target encoder self.scaler = StandardScaler() self.X_train = None @@ -75,6 +76,13 @@ class DataProcessor: X = self.df[feature_cols].copy() y = self.df[self.target_column].copy() + # Encode target variable if it's categorical + if y.dtype == 'object' or y.dtype.name == 'category': + self.target_encoder = LabelEncoder() + y_encoded = self.target_encoder.fit_transform(y) + y = pd.Series(y_encoded, index=y.index) + print(f"Target '{self.target_column}' encoded: {dict(enumerate(self.target_encoder.classes_))}") + # Encode categorical variables for col in self.categorical_features: if col in X.columns: diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..f88a663 --- /dev/null +++ b/api/__init__.py @@ -0,0 +1 @@ +# Empty __init__.py files for Python package structure diff --git a/api/main.py b/api/main.py new file mode 100644 index 0000000..7904c89 --- /dev/null +++ b/api/main.py @@ -0,0 +1,72 @@ +""" +FastAPI Backend for Nordic Privacy AI +Provides endpoints for AI Governance analysis and data cleaning +""" + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +import os + +from api.routers import analyze, clean + +# Create FastAPI app +app = FastAPI( + title="Nordic Privacy AI API", + description="AI-powered GDPR compliance, bias detection, and risk analysis", + version="1.0.0" +) + +# CORS configuration for Next.js frontend +app.add_middleware( + CORSMiddleware, + allow_origins=[ + "http://localhost:3000", # Next.js dev server + "http://127.0.0.1:3000", + ], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Mount reports directory for file downloads +reports_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "reports") +os.makedirs(reports_dir, exist_ok=True) +app.mount("/reports", StaticFiles(directory=reports_dir), name="reports") + +# Include routers +app.include_router(analyze.router, prefix="/api", tags=["AI Governance"]) +app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"]) + +@app.get("/") +async def root(): + """Health check endpoint""" + return { + "status": "online", + "service": "Nordic Privacy AI API", + "version": "1.0.0", + "endpoints": { + "analyze": "/api/analyze", + "clean": "/api/clean", + "docs": "/docs" + } + } + +@app.get("/health") +async def health_check(): + """Detailed health check""" + try: + import torch + cuda_available = torch.cuda.is_available() + gpu_name = torch.cuda.get_device_name(0) if cuda_available else None + except: + cuda_available = False + gpu_name = None + + return { + "status": "healthy", + "gpu_acceleration": { + "available": cuda_available, + "device": gpu_name or "CPU" + } + } diff --git a/api/routers/__init__.py b/api/routers/__init__.py new file mode 100644 index 0000000..f88a663 --- /dev/null +++ b/api/routers/__init__.py @@ -0,0 +1 @@ +# Empty __init__.py files for Python package structure diff --git 
a/api/routers/analyze.py b/api/routers/analyze.py new file mode 100644 index 0000000..f87a5e6 --- /dev/null +++ b/api/routers/analyze.py @@ -0,0 +1,141 @@ +""" +AI Governance Analysis Router +Handles bias detection and risk analysis endpoints +""" + +from fastapi import APIRouter, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +import pandas as pd +import numpy as np +import io +import os +import json +from datetime import datetime +from typing import Dict, Any + +# Import AI Governance modules +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +from ai_governance import AIGovernanceAnalyzer + +router = APIRouter() + + +def convert_to_serializable(obj): + """Convert numpy/pandas types to native Python types for JSON serialization""" + if isinstance(obj, (np.integer, np.int64, np.int32)): + return int(obj) + elif isinstance(obj, (np.floating, np.float64, np.float32)): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {key: convert_to_serializable(value) for key, value in obj.items()} + elif isinstance(obj, list): + return [convert_to_serializable(item) for item in obj] + return obj + +@router.post("/analyze") +async def analyze_dataset(file: UploadFile = File(...)): + """ + Analyze uploaded dataset for bias and risk + + - **file**: CSV file to analyze + + Returns: + - Analysis results (bias metrics, risk assessment) + - Report file path for download + """ + + # Validate file type + if not file.filename.endswith('.csv'): + raise HTTPException(status_code=400, detail="Only CSV files are supported") + + try: + # Read uploaded file + contents = await file.read() + df = pd.read_csv(io.BytesIO(contents)) + + if df.empty: + raise HTTPException(status_code=400, detail="Uploaded file is empty") + + # Initialize AI Governance Analyzer + analyzer = AIGovernanceAnalyzer() + + # Auto-detect target column and protected attributes + # Target: Last column (common convention) or first binary/categorical column + target_column = df.columns[-1] + + # Protected attributes: Common sensitive columns + protected_keywords = ['gender', 'age', 'race', 'sex', 'ethnicity', 'religion', 'nationality'] + protected_attributes = [col for col in df.columns + if any(keyword in col.lower() for keyword in protected_keywords)] + + # If no protected attributes found, use first few categorical columns + if not protected_attributes: + categorical_cols = df.select_dtypes(include=['object', 'category']).columns + protected_attributes = [col for col in categorical_cols if col != target_column][:3] + + print(f"Analyzing dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)") + print(f"Target column: {target_column}") + print(f"Protected attributes: {protected_attributes}") + + # Run analysis + report = analyzer.analyze_dataframe(df, target_column, protected_attributes) + + # Generate report filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_filename = (file.filename or "dataset").replace('.csv', '') + report_filename = f"governance_report_{safe_filename}_{timestamp}.json" + report_path = os.path.join("reports", report_filename) + + # Save full report to disk + full_report_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + report_path + ) + analyzer.save_report(report, full_report_path) + + # Prepare response with summary + response_data = { + "status": "success", + "filename": file.filename, + "dataset_info": { + "rows": len(df), + 
"columns": len(df.columns), + "features": list(df.columns) + }, + "model_performance": { + "accuracy": report.get("model_metrics", {}).get("accuracy", 0), + "precision": report.get("model_metrics", {}).get("precision", 0), + "recall": report.get("model_metrics", {}).get("recall", 0), + "f1_score": report.get("model_metrics", {}).get("f1_score", 0) + }, + "bias_metrics": { + "overall_bias_score": report.get("bias_metrics", {}).get("overall_bias_score", 0), + "disparate_impact": report.get("bias_metrics", {}).get("disparate_impact", {}), + "statistical_parity": report.get("bias_metrics", {}).get("statistical_parity_difference", {}), + "violations_detected": report.get("bias_metrics", {}).get("fairness_violations", []) + }, + "risk_assessment": { + "overall_risk_score": report.get("risk_metrics", {}).get("overall_risk_score", 0), + "privacy_risks": report.get("risk_metrics", {}).get("privacy_risks", []), + "ethical_risks": report.get("risk_metrics", {}).get("ethical_risks", []), + "compliance_risks": report.get("risk_metrics", {}).get("compliance_risks", []), + "data_quality_risks": report.get("risk_metrics", {}).get("data_quality_risks", []) + }, + "recommendations": report.get("recommendations", []), + "report_file": f"/{report_path}", + "timestamp": datetime.now().isoformat() + } + + # Convert all numpy/pandas types to native Python types + response_data = convert_to_serializable(response_data) + + return JSONResponse(content=response_data) + + except pd.errors.EmptyDataError: + raise HTTPException(status_code=400, detail="File is empty or invalid CSV format") + except Exception as e: + print(f"Error during analysis: {str(e)}") + raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") diff --git a/api/routers/clean.py b/api/routers/clean.py new file mode 100644 index 0000000..0a5eb68 --- /dev/null +++ b/api/routers/clean.py @@ -0,0 +1,142 @@ +""" +Data Cleaning Router +Handles PII detection and anonymization endpoints +""" + +from fastapi import APIRouter, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +import pandas as pd +import numpy as np +import io +import os +from datetime import datetime +from typing import Dict, Any + +# Import cleaning module +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +from data_cleaning import DataCleaner + +router = APIRouter() + + +def convert_to_serializable(obj): + """Convert numpy/pandas types to native Python types for JSON serialization""" + if isinstance(obj, (np.integer, np.int64, np.int32)): + return int(obj) + elif isinstance(obj, (np.floating, np.float64, np.float32)): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {key: convert_to_serializable(value) for key, value in obj.items()} + elif isinstance(obj, list): + return [convert_to_serializable(item) for item in obj] + return obj + + +@router.post("/clean") +async def clean_dataset(file: UploadFile = File(...)): + """ + Clean uploaded dataset - detect and anonymize PII + + - **file**: CSV file to clean + + Returns: + - Cleaned dataset statistics + - PII detections and anonymization actions + - Report file path for download + - Cleaned CSV file path for download + """ + + # Validate file type + if not file.filename.endswith('.csv'): + raise HTTPException(status_code=400, detail="Only CSV files are supported") + + try: + # Read uploaded file + contents = await file.read() + df = pd.read_csv(io.BytesIO(contents)) + + if df.empty: + raise 
HTTPException(status_code=400, detail="Uploaded file is empty") + + # Initialize Data Cleaner (with GPU if available) + print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)") + cleaner = DataCleaner(df, use_gpu=True) + + # Run cleaning (non-interactive mode for API) + cleaned_df, audit_report = cleaner.clean( + risky_features=None, # Auto-detect + interactive=False, # No user prompts in API mode + scan_all_cells=True + ) + + # Generate filenames + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_filename = file.filename.replace('.csv', '') + + # Save cleaned CSV + cleaned_csv_filename = f"cleaned_{safe_filename}_{timestamp}.csv" + cleaned_csv_path = os.path.join("reports", cleaned_csv_filename) + full_cleaned_csv_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + cleaned_csv_path + ) + cleaner.save_cleaned_data(cleaned_df, full_cleaned_csv_path) + + # Save audit report + audit_report_filename = f"cleaning_audit_{safe_filename}_{timestamp}.json" + audit_report_path = os.path.join("reports", audit_report_filename) + full_audit_report_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + audit_report_path + ) + cleaner.save_audit_report(audit_report, full_audit_report_path) + + # Prepare response + response_data = { + "status": "success", + "filename": file.filename, + "dataset_info": { + "original_rows": int(audit_report["metadata"]["original_rows"]), + "original_columns": int(audit_report["metadata"]["original_columns"]), + "cleaned_rows": int(audit_report["metadata"]["cleaned_rows"]), + "cleaned_columns": int(audit_report["metadata"]["cleaned_columns"]) + }, + "gpu_acceleration": audit_report["metadata"].get("gpu_acceleration", { + "enabled": False, + "device": "CPU" + }), + "summary": { + "columns_removed": audit_report["summary"]["columns_removed"], + "columns_anonymized": audit_report["summary"]["columns_anonymized"], + "total_cells_affected": int(audit_report["summary"]["total_cells_affected"]) + }, + "pii_detections": { + col: { + "action": details["action"], + "entity_types": details["entity_types_found"], + "num_affected_rows": int(details.get("num_affected_rows", 0)), + "examples": details.get("examples", [])[:2] # Show 2 examples + } + for col, details in audit_report["details"].items() + }, + "gdpr_compliance": audit_report["compliance"]["gdpr_articles_applied"], + "files": { + "cleaned_csv": f"/{cleaned_csv_path}", + "audit_report": f"/{audit_report_path}" + }, + "timestamp": datetime.now().isoformat() + } + + # Convert all numpy/pandas types to native Python types + response_data = convert_to_serializable(response_data) + + return JSONResponse(content=response_data) + + except pd.errors.EmptyDataError: + raise HTTPException(status_code=400, detail="File is empty or invalid CSV format") + except Exception as e: + print(f"Error during cleaning: {str(e)}") + raise HTTPException(status_code=500, detail=f"Cleaning failed: {str(e)}") diff --git a/api/utils/__init__.py b/api/utils/__init__.py new file mode 100644 index 0000000..f88a663 --- /dev/null +++ b/api/utils/__init__.py @@ -0,0 +1 @@ +# Empty __init__.py files for Python package structure diff --git a/data_cleaning/__init__.py b/data_cleaning/__init__.py new file mode 100644 index 0000000..c311fd1 --- /dev/null +++ b/data_cleaning/__init__.py @@ -0,0 +1,12 @@ +""" +Data Cleaning Module +Automated PII detection and GDPR-compliant anonymization +""" + +from .cleaner import DataCleaner + +__version__ = '1.0.0' + +__all__ 
= [ + 'DataCleaner' +] diff --git a/cleaning.py b/data_cleaning/cleaner.py similarity index 100% rename from cleaning.py rename to data_cleaning/cleaner.py diff --git a/cleaning_config.py b/data_cleaning/config.py similarity index 100% rename from cleaning_config.py rename to data_cleaning/config.py diff --git a/frontend/nordic-privacy-ai/.gitignore b/frontend/.gitignore similarity index 100% rename from frontend/nordic-privacy-ai/.gitignore rename to frontend/.gitignore diff --git a/frontend/nordic-privacy-ai/app/globals.css b/frontend/app/globals.css similarity index 100% rename from frontend/nordic-privacy-ai/app/globals.css rename to frontend/app/globals.css diff --git a/frontend/nordic-privacy-ai/app/layout.tsx b/frontend/app/layout.tsx similarity index 100% rename from frontend/nordic-privacy-ai/app/layout.tsx rename to frontend/app/layout.tsx diff --git a/frontend/nordic-privacy-ai/app/page.tsx b/frontend/app/page.tsx similarity index 100% rename from frontend/nordic-privacy-ai/app/page.tsx rename to frontend/app/page.tsx diff --git a/frontend/nordic-privacy-ai/app/try/page.tsx b/frontend/app/try/page.tsx similarity index 85% rename from frontend/nordic-privacy-ai/app/try/page.tsx rename to frontend/app/try/page.tsx index 66c51f5..bf2e24c 100644 --- a/frontend/nordic-privacy-ai/app/try/page.tsx +++ b/frontend/app/try/page.tsx @@ -14,10 +14,10 @@ export default function TryPage() {
-
+
setTab("bias-analysis")} />
); -} +} \ No newline at end of file diff --git a/frontend/nordic-privacy-ai/components/Navbar.tsx b/frontend/components/Navbar.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/Navbar.tsx rename to frontend/components/Navbar.tsx diff --git a/frontend/nordic-privacy-ai/components/common/Reveal.tsx b/frontend/components/common/Reveal.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/common/Reveal.tsx rename to frontend/components/common/Reveal.tsx diff --git a/frontend/nordic-privacy-ai/components/landing/AgentsOverview.tsx b/frontend/components/landing/AgentsOverview.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/landing/AgentsOverview.tsx rename to frontend/components/landing/AgentsOverview.tsx diff --git a/frontend/nordic-privacy-ai/components/landing/FeatureGrid.tsx b/frontend/components/landing/FeatureGrid.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/landing/FeatureGrid.tsx rename to frontend/components/landing/FeatureGrid.tsx diff --git a/frontend/nordic-privacy-ai/components/landing/Footer.tsx b/frontend/components/landing/Footer.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/landing/Footer.tsx rename to frontend/components/landing/Footer.tsx diff --git a/frontend/nordic-privacy-ai/components/landing/Steps.tsx b/frontend/components/landing/Steps.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/landing/Steps.tsx rename to frontend/components/landing/Steps.tsx diff --git a/frontend/nordic-privacy-ai/components/landing/ValueProps.tsx b/frontend/components/landing/ValueProps.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/landing/ValueProps.tsx rename to frontend/components/landing/ValueProps.tsx diff --git a/frontend/components/try/CenterPanel.tsx b/frontend/components/try/CenterPanel.tsx new file mode 100644 index 0000000..1e44eb7 --- /dev/null +++ b/frontend/components/try/CenterPanel.tsx @@ -0,0 +1,620 @@ +"use client"; +import { TryTab } from "./Sidebar"; +import { useState, useRef, useCallback, useEffect } from "react"; +import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "../../lib/indexeddb"; +import { analyzeDataset, cleanDataset, getReportUrl, type AnalyzeResponse, type CleanResponse } from "../../lib/api"; + +interface CenterPanelProps { + tab: TryTab; + onAnalyze?: () => void; +} + +interface UploadedFileMeta { + name: string; + size: number; + type: string; + contentPreview: string; +} + +interface TablePreviewData { + headers: string[]; + rows: string[][]; + origin: 'csv'; +} + +export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { + const PREVIEW_BYTES = 64 * 1024; // read first 64KB slice for large-file preview + const [fileMeta, setFileMeta] = useState(null); + const [uploadedFile, setUploadedFile] = useState(null); + const [isDragging, setIsDragging] = useState(false); + const [progress, setProgress] = useState(0); + const [progressLabel, setProgressLabel] = useState("Processing"); + const [tablePreview, setTablePreview] = useState(null); + const inputRef = useRef(null); + const [loadedFromCache, setLoadedFromCache] = useState(false); + const [isProcessing, setIsProcessing] = useState(false); + const [error, setError] = useState(null); + + // Analysis results + const [analyzeResult, setAnalyzeResult] = useState(null); + const [cleanResult, setCleanResult] = useState(null); + + const reset = () => { + setFileMeta(null); + setUploadedFile(null); + 
setProgress(0); + setProgressLabel("Processing"); + setTablePreview(null); + setError(null); + }; + + // Handle API calls + const handleAnalyze = async () => { + if (!uploadedFile) { + setError("No file uploaded"); + return; + } + + setIsProcessing(true); + setError(null); + setProgressLabel("Analyzing dataset..."); + + try { + const result = await analyzeDataset(uploadedFile); + setAnalyzeResult(result); + setProgressLabel("Analysis complete!"); + onAnalyze?.(); // Navigate to bias-analysis tab + } catch (err: any) { + setError(err.message || "Analysis failed"); + } finally { + setIsProcessing(false); + } + }; + + const handleClean = async () => { + if (!uploadedFile) { + setError("No file uploaded"); + return; + } + + setIsProcessing(true); + setError(null); + setProgressLabel("Cleaning dataset..."); + + try { + const result = await cleanDataset(uploadedFile); + setCleanResult(result); + setProgressLabel("Cleaning complete!"); + } catch (err: any) { + setError(err.message || "Cleaning failed"); + } finally { + setIsProcessing(false); + } + }; function tryParseCSV(text: string, maxRows = 50, maxCols = 40): TablePreviewData | null { + const lines = text.split(/\r?\n/).filter(l => l.trim().length > 0); + if (lines.length < 2) return null; + const commaDensity = lines.slice(0, 10).filter(l => l.includes(',')).length; + if (commaDensity < 2) return null; + const parseLine = (line: string) => { + const out: string[] = []; + let cur = ''; + let inQuotes = false; + for (let i = 0; i < line.length; i++) { + const ch = line[i]; + if (ch === '"') { + if (inQuotes && line[i + 1] === '"') { cur += '"'; i++; } else { inQuotes = !inQuotes; } + } else if (ch === ',' && !inQuotes) { + out.push(cur); + cur = ''; + } else { cur += ch; } + } + out.push(cur); + return out.map(c => c.trim()); + }; + const raw = lines.slice(0, maxRows).map(parseLine); + if (raw.length === 0) return null; + const headers = raw[0]; + const colCount = Math.min(headers.length, maxCols); + const rows = raw.slice(1).map(r => r.slice(0, colCount)); + return { headers: headers.slice(0, colCount), rows, origin: 'csv' }; + } + + // We no longer build table preview for JSON; revert JSON to raw text view. + + const processFile = useCallback(async (f: File) => { + if (!f) return; + const isCSV = /\.csv$/i.test(f.name); + setProgress(0); + setUploadedFile(f); // Save the file for API calls + + // For large files, show a progress bar while reading the file stream (no preview) + if (f.size > 1024 * 1024) { + setProgressLabel("Uploading"); + const metaObj: UploadedFileMeta = { + name: f.name, + size: f.size, + type: f.type || "unknown", + contentPreview: `Loading partial preview (first ${Math.round(PREVIEW_BYTES/1024)}KB)...`, + }; + setFileMeta(metaObj); + setTablePreview(null); + // Save to IndexedDB immediately so it persists without needing full read + (async () => { + try { await saveLatestUpload(f, metaObj); } catch {} + })(); + // Read head slice for partial preview & possible CSV table extraction + try { + const headBlob = f.slice(0, PREVIEW_BYTES); + const headReader = new FileReader(); + headReader.onload = async () => { + try { + const buf = headReader.result as ArrayBuffer; + const decoder = new TextDecoder(); + const text = decoder.decode(buf); + setFileMeta(prev => prev ? 
{ ...prev, contentPreview: text.slice(0, 4000) } : prev); + if (isCSV) { + const parsed = tryParseCSV(text); + setTablePreview(parsed); + } else { + setTablePreview(null); + } + try { await saveLatestUpload(f, { ...metaObj, contentPreview: text.slice(0, 4000) }); } catch {} + } catch { /* ignore */ } + }; + headReader.readAsArrayBuffer(headBlob); + } catch { /* ignore */ } + // Use streaming read for progress without buffering entire file in memory + try { + const stream: ReadableStream | undefined = (typeof (f as any).stream === "function" ? (f as any).stream() : undefined); + if (stream && typeof stream.getReader === "function") { + const reader = stream.getReader(); + let loaded = 0; + const total = f.size || 1; + for (;;) { + const { done, value } = await reader.read(); + if (done) break; + loaded += value ? value.length : 0; + const pct = Math.min(100, Math.round((loaded / total) * 100)); + setProgress(pct); + } + setProgress(100); + } else { + // Fallback to FileReader progress events + const reader = new FileReader(); + reader.onprogress = (evt) => { + if (evt.lengthComputable) { + const pct = Math.min(100, Math.round((evt.loaded / evt.total) * 100)); + setProgress(pct); + } else { + setProgress((p) => (p < 90 ? p + 5 : p)); + } + }; + reader.onloadend = () => setProgress(100); + reader.onerror = () => setProgress(0); + reader.readAsArrayBuffer(f); + } + } catch { + setProgress(100); + } + return; + } + const reader = new FileReader(); + reader.onprogress = (evt) => { + if (evt.lengthComputable) { + const pct = Math.min(100, Math.round((evt.loaded / evt.total) * 100)); + setProgress(pct); + } else { + setProgress((p) => (p < 90 ? p + 5 : p)); + } + }; + reader.onload = async () => { + try { + const buf = reader.result as ArrayBuffer; + const decoder = new TextDecoder(); + const text = decoder.decode(buf); + const metaObj: UploadedFileMeta = { + name: f.name, + size: f.size, + type: f.type || "unknown", + contentPreview: text.slice(0, 4000), + }; + setFileMeta(metaObj); + if (isCSV) { + const parsed = tryParseCSV(text); + setTablePreview(parsed); + } else { + setTablePreview(null); + } + // Save file blob and meta to browser cache (IndexedDB) + try { + await saveLatestUpload(f, metaObj); + } catch {} + setProgressLabel("Processing"); + setProgress(100); + } catch (e) { + const metaObj: UploadedFileMeta = { + name: f.name, + size: f.size, + type: f.type || "unknown", + contentPreview: "Unable to decode preview.", + }; + setFileMeta(metaObj); + setTablePreview(null); + try { + await saveLatestUpload(f, metaObj); + } catch {} + setProgressLabel("Processing"); + setProgress(100); + } + }; + reader.onerror = () => { + setProgress(0); + }; + reader.readAsArrayBuffer(f); + }, []); + + function handleFileChange(e: React.ChangeEvent) { + const f = e.target.files?.[0]; + processFile(f as File); + } + + const onDragOver = (e: React.DragEvent) => { + e.preventDefault(); + setIsDragging(true); + }; + const onDragLeave = () => setIsDragging(false); + const onDrop = (e: React.DragEvent) => { + e.preventDefault(); + setIsDragging(false); + const f = e.dataTransfer.files?.[0]; + processFile(f as File); + }; + + // Load last cached upload on mount (processing tab only) + useEffect(() => { + let ignore = false; + if (tab !== "processing") return; + (async () => { + try { + const { file, meta } = await getLatestUpload(); + if (!ignore && meta) { + setFileMeta(meta as UploadedFileMeta); + if (file) { + setUploadedFile(file); + } + setLoadedFromCache(true); + } + } catch {} + })(); + return () => { + 
ignore = true; + }; + }, [tab]); function renderTabContent() { + switch (tab) { + case "processing": + return ( +
+

Upload & Process Data

+

Upload a CSV / JSON / text file. We'll parse it, detect PII, and run bias and risk analyses.

+
+
+

Drag & drop a CSV / JSON / TXT file here, or click to browse.

+
+ +
+
+ + {progress > 0 && ( +
+
+
+
+
{progressLabel} {progress}%
+
+ )} + {fileMeta && ( +
+
+
{fileMeta.name}
+
{Math.round(fileMeta.size / 1024)} KB
+
+ {loadedFromCache && ( +
Loaded from browser cache
+ )} +
{fileMeta.type || "Unknown type"}
+ {/* Table preview when structured data detected; otherwise show text */} + {tablePreview && tablePreview.origin === 'csv' ? ( +
+ + + + {tablePreview.headers.map((h, idx) => ( + + ))} + + + + {tablePreview.rows.map((r, i) => ( + + {r.map((c, j) => ( + + ))} + + ))} + +
{h}
{c}
+
+ ) : ( +
+														{fileMeta.contentPreview || "(no preview)"}
+													
+ )} + + {error && ( +
+ ❌ {error} +
+ )} + + {analyzeResult && ( +
+ ✅ Analysis complete! View results in tabs. + + Download Report + +
+ )} + + {cleanResult && ( +
+ ✅ Cleaning complete! {cleanResult.summary.total_cells_affected} cells anonymized. + +
+ )} + +
+ + + +
+
+ )} +
+
+ ); + case "bias-analysis": + return ( +
+

Bias Analysis

+ {analyzeResult ? ( +
+
+
+
Overall Bias Score
+
{(analyzeResult.bias_metrics.overall_bias_score * 100).toFixed(1)}%
+
+
+
Violations Detected
+
{analyzeResult.bias_metrics.violations_detected.length}
+
+
+ +
+

Model Performance

+
+
+
Accuracy
+
{(analyzeResult.model_performance.accuracy * 100).toFixed(1)}%
+
+
+
Precision
+
{(analyzeResult.model_performance.precision * 100).toFixed(1)}%
+
+
+
Recall
+
{(analyzeResult.model_performance.recall * 100).toFixed(1)}%
+
+
+
F1 Score
+
{(analyzeResult.model_performance.f1_score * 100).toFixed(1)}%
+
+
+
+
+ ) : ( +

Upload and analyze a dataset to see bias metrics.

+ )} +
+ ); + case "risk-analysis": + return ( +
+

Risk Analysis

+ {analyzeResult ? ( +
+
+
Overall Risk Score
+
{(analyzeResult.risk_assessment.overall_risk_score * 100).toFixed(1)}%
+
+ + {cleanResult && ( +
+

PII Detection Results

+
+
Cells Anonymized: {cleanResult.summary.total_cells_affected}
+
Columns Removed: {cleanResult.summary.columns_removed.length}
+
Columns Anonymized: {cleanResult.summary.columns_anonymized.length}
+
+
+ )} +
+ ) : ( +

Upload and analyze a dataset to see risk assessment.

+ )} +
+ ); + case "bias-risk-mitigation": + return ( +
+

Mitigation Suggestions

+ {analyzeResult && analyzeResult.recommendations.length > 0 ? ( +
+ {analyzeResult.recommendations.map((rec, i) => ( +
+ {rec} +
+ ))} +
+ ) : ( +

+ Recommendations will appear here after analysis. +

+ )} +
+ ); + case "results": + return ( +
+

Results Summary

+ {(analyzeResult || cleanResult) ? ( +
+ {analyzeResult && ( +
+

Analysis Results

+
+
Dataset: {analyzeResult.filename}
+
Rows: {analyzeResult.dataset_info.rows}
+
Columns: {analyzeResult.dataset_info.columns}
+
Bias Score: {(analyzeResult.bias_metrics.overall_bias_score * 100).toFixed(1)}%
+
Risk Score: {(analyzeResult.risk_assessment.overall_risk_score * 100).toFixed(1)}%
+
+ + Download Full Report → + +
+ )} + + {cleanResult && ( +
+

Cleaning Results

+
+
Original: {cleanResult.dataset_info.original_rows} rows × {cleanResult.dataset_info.original_columns} cols
+
Cleaned: {cleanResult.dataset_info.cleaned_rows} rows × {cleanResult.dataset_info.cleaned_columns} cols
+
Cells Anonymized: {cleanResult.summary.total_cells_affected}
+
Columns Removed: {cleanResult.summary.columns_removed.length}
+
GDPR Compliance: {cleanResult.gdpr_compliance.length} articles applied
+
+ +
+ )} +
+ ) : ( +

+ Process a dataset to see aggregated results. +

+ )} +
+ ); + default: + return null; + } + } + + return ( +
+ {renderTabContent()} +
+ ); +} \ No newline at end of file diff --git a/frontend/nordic-privacy-ai/components/try/CenterPanel.tsx b/frontend/components/try/CenterPanel_BACKUP.tsx similarity index 66% rename from frontend/nordic-privacy-ai/components/try/CenterPanel.tsx rename to frontend/components/try/CenterPanel_BACKUP.tsx index e8239c7..3c587b2 100644 --- a/frontend/nordic-privacy-ai/components/try/CenterPanel.tsx +++ b/frontend/components/try/CenterPanel_BACKUP.tsx @@ -4,6 +4,7 @@ import { useState, useRef, useCallback, useEffect } from "react"; interface CenterPanelProps { tab: TryTab; + onAnalyze?: () => void; } interface UploadedFileMeta { @@ -13,11 +14,19 @@ interface UploadedFileMeta { contentPreview: string; } -export function CenterPanel({ tab }: CenterPanelProps) { +interface TablePreviewData { + headers: string[]; + rows: string[][]; + origin: 'csv'; +} + +export function CenterPanel({ tab, onAnalyze }: CenterPanelProps) { + const PREVIEW_BYTES = 64 * 1024; // read first 64KB slice for large-file preview const [fileMeta, setFileMeta] = useState(null); const [isDragging, setIsDragging] = useState(false); const [progress, setProgress] = useState(0); const [progressLabel, setProgressLabel] = useState("Processing"); + const [tablePreview, setTablePreview] = useState(null); const inputRef = useRef(null); const [loadedFromCache, setLoadedFromCache] = useState(false); @@ -25,10 +34,43 @@ export function CenterPanel({ tab }: CenterPanelProps) { setFileMeta(null); setProgress(0); setProgressLabel("Processing"); + setTablePreview(null); }; + function tryParseCSV(text: string, maxRows = 50, maxCols = 40): TablePreviewData | null { + const lines = text.split(/\r?\n/).filter(l => l.trim().length > 0); + if (lines.length < 2) return null; + const commaDensity = lines.slice(0, 10).filter(l => l.includes(',')).length; + if (commaDensity < 2) return null; + const parseLine = (line: string) => { + const out: string[] = []; + let cur = ''; + let inQuotes = false; + for (let i = 0; i < line.length; i++) { + const ch = line[i]; + if (ch === '"') { + if (inQuotes && line[i + 1] === '"') { cur += '"'; i++; } else { inQuotes = !inQuotes; } + } else if (ch === ',' && !inQuotes) { + out.push(cur); + cur = ''; + } else { cur += ch; } + } + out.push(cur); + return out.map(c => c.trim()); + }; + const raw = lines.slice(0, maxRows).map(parseLine); + if (raw.length === 0) return null; + const headers = raw[0]; + const colCount = Math.min(headers.length, maxCols); + const rows = raw.slice(1).map(r => r.slice(0, colCount)); + return { headers: headers.slice(0, colCount), rows, origin: 'csv' }; + } + + // We no longer build table preview for JSON; revert JSON to raw text view. 
+ const processFile = useCallback(async (f: File) => { if (!f) return; + const isCSV = /\.csv$/i.test(f.name); setProgress(0); // For large files, show a progress bar while reading the file stream (no preview) if (f.size > 1024 * 1024) { @@ -37,13 +79,35 @@ export function CenterPanel({ tab }: CenterPanelProps) { name: f.name, size: f.size, type: f.type || "unknown", - contentPreview: "File too large for preview (limit 1MB).", + contentPreview: `Loading partial preview (first ${Math.round(PREVIEW_BYTES/1024)}KB)...`, }; setFileMeta(metaObj); + setTablePreview(null); // Save to IndexedDB immediately so it persists without needing full read (async () => { try { await saveLatestUpload(f, metaObj); } catch {} })(); + // Read head slice for partial preview & possible CSV table extraction + try { + const headBlob = f.slice(0, PREVIEW_BYTES); + const headReader = new FileReader(); + headReader.onload = async () => { + try { + const buf = headReader.result as ArrayBuffer; + const decoder = new TextDecoder(); + const text = decoder.decode(buf); + setFileMeta(prev => prev ? { ...prev, contentPreview: text.slice(0, 4000) } : prev); + if (isCSV) { + const parsed = tryParseCSV(text); + setTablePreview(parsed); + } else { + setTablePreview(null); + } + try { await saveLatestUpload(f, { ...metaObj, contentPreview: text.slice(0, 4000) }); } catch {} + } catch { /* ignore */ } + }; + headReader.readAsArrayBuffer(headBlob); + } catch { /* ignore */ } // Use streaming read for progress without buffering entire file in memory try { const stream: ReadableStream | undefined = (typeof (f as any).stream === "function" ? (f as any).stream() : undefined); @@ -100,6 +164,12 @@ export function CenterPanel({ tab }: CenterPanelProps) { contentPreview: text.slice(0, 4000), }; setFileMeta(metaObj); + if (isCSV) { + const parsed = tryParseCSV(text); + setTablePreview(parsed); + } else { + setTablePreview(null); + } // Save file blob and meta to browser cache (IndexedDB) try { await saveLatestUpload(f, metaObj); @@ -114,6 +184,7 @@ export function CenterPanel({ tab }: CenterPanelProps) { contentPreview: "Unable to decode preview.", }; setFileMeta(metaObj); + setTablePreview(null); try { await saveLatestUpload(f, metaObj); } catch {} @@ -163,13 +234,13 @@ export function CenterPanel({ tab }: CenterPanelProps) { }, [tab]); function renderTabContent() { - switch (tab) { + switch (tab) { case "processing": - return ( -
+ return ( +

Upload & Process Data

Upload a CSV / JSON / text file. We will later parse, detect PII, and queue analyses.

-
+
Loaded from browser cache
)}
{fileMeta.type || "Unknown type"}
-
-										{fileMeta.contentPreview || "(no preview)"}
-									
-
+ {/* Table preview when structured data detected; otherwise show text */} + {tablePreview && tablePreview.origin === 'csv' ? ( +
+ + + + {tablePreview.headers.map((h, idx) => ( + + ))} + + + + {tablePreview.rows.map((r, i) => ( + + {r.map((c, j) => ( + + ))} + + ))} + +
{h}
{c}
+
+ ) : ( +
+														{fileMeta.contentPreview || "(no preview)"}
+													
+ )} +
+
)} diff --git a/frontend/nordic-privacy-ai/components/try/ChatbotPanel.tsx b/frontend/components/try/ChatbotPanel.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/try/ChatbotPanel.tsx rename to frontend/components/try/ChatbotPanel.tsx diff --git a/frontend/nordic-privacy-ai/components/try/Sidebar.tsx b/frontend/components/try/Sidebar.tsx similarity index 100% rename from frontend/nordic-privacy-ai/components/try/Sidebar.tsx rename to frontend/components/try/Sidebar.tsx diff --git a/frontend/nordic-privacy-ai/next.config.mjs b/frontend/next.config.mjs similarity index 100% rename from frontend/nordic-privacy-ai/next.config.mjs rename to frontend/next.config.mjs diff --git a/frontend/nordic-privacy-ai/README.md b/frontend/nordic-privacy-ai/README.md deleted file mode 100644 index b424092..0000000 --- a/frontend/nordic-privacy-ai/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Nordic Privacy AI - -AI-Powered GDPR compliance & personal data protection platform tailored for Nordic ecosystems (BankID, MitID, Suomi.fi). - -## Tech Stack -- Next.js (App Router, TypeScript) -- Tailwind CSS - -## Getting Started -```powershell -npm install -npm run dev -``` -Visit http://localhost:3000 to view the landing page. - -## Scripts -- `npm run dev` – Start dev server -- `npm run build` – Production build -- `npm start` – Run built app -- `npm run lint` – ESLint - -## Next Steps -- Implement /try page workflow -- Add feature sections & agent explanations -- Integrate backend services for data upload & scanning - -## License -Internal hackathon prototype diff --git a/frontend/nordic-privacy-ai/lib/idb.ts b/frontend/nordic-privacy-ai/lib/idb.ts new file mode 100644 index 0000000..507b85a --- /dev/null +++ b/frontend/nordic-privacy-ai/lib/idb.ts @@ -0,0 +1,113 @@ +/** + * IndexedDB utilities for persisting file uploads in the browser. + * Stores the latest uploaded file and its metadata for recovery across sessions. 
+ */ + +const DB_NAME = "NordicPrivacyAI"; +const DB_VERSION = 1; +const STORE_NAME = "latestUpload"; + +interface UploadedFileMeta { + name: string; + size: number; + type: string; + contentPreview: string; +} + +interface LatestUploadData { + file: File; + meta: UploadedFileMeta; + timestamp: number; +} + +/** + * Open or create the IndexedDB database + */ +function openDB(): Promise { + return new Promise((resolve, reject) => { + const request = indexedDB.open(DB_NAME, DB_VERSION); + + request.onerror = () => reject(request.error); + request.onsuccess = () => resolve(request.result); + + request.onupgradeneeded = (event) => { + const db = (event.target as IDBOpenDBRequest).result; + + // Create object store if it doesn't exist + if (!db.objectStoreNames.contains(STORE_NAME)) { + db.createObjectStore(STORE_NAME); + } + }; + }); +} + +/** + * Save the latest uploaded file and its metadata to IndexedDB + */ +export async function saveLatestUpload( + file: File, + meta: UploadedFileMeta +): Promise { + const db = await openDB(); + + return new Promise((resolve, reject) => { + const transaction = db.transaction([STORE_NAME], "readwrite"); + const store = transaction.objectStore(STORE_NAME); + + const data: LatestUploadData = { + file, + meta, + timestamp: Date.now(), + }; + + const request = store.put(data, "latest"); + + request.onerror = () => reject(request.error); + request.onsuccess = () => resolve(); + + transaction.oncomplete = () => db.close(); + }); +} + +/** + * Retrieve the latest uploaded file and metadata from IndexedDB + */ +export async function getLatestUpload(): Promise { + const db = await openDB(); + + return new Promise((resolve, reject) => { + const transaction = db.transaction([STORE_NAME], "readonly"); + const store = transaction.objectStore(STORE_NAME); + const request = store.get("latest"); + + request.onerror = () => reject(request.error); + request.onsuccess = () => { + const result = request.result as LatestUploadData | undefined; + if (result) { + resolve(result); + } else { + reject(new Error("No cached upload found")); + } + }; + + transaction.oncomplete = () => db.close(); + }); +} + +/** + * Delete the latest upload from IndexedDB + */ +export async function deleteLatestUpload(): Promise { + const db = await openDB(); + + return new Promise((resolve, reject) => { + const transaction = db.transaction([STORE_NAME], "readwrite"); + const store = transaction.objectStore(STORE_NAME); + const request = store.delete("latest"); + + request.onerror = () => reject(request.error); + request.onsuccess = () => resolve(); + + transaction.oncomplete = () => db.close(); + }); +} diff --git a/frontend/nordic-privacy-ai/next-env.d.ts b/frontend/nordic-privacy-ai/next-env.d.ts deleted file mode 100644 index 4f11a03..0000000 --- a/frontend/nordic-privacy-ai/next-env.d.ts +++ /dev/null @@ -1,5 +0,0 @@ -/// -/// - -// NOTE: This file should not be edited -// see https://nextjs.org/docs/basic-features/typescript for more information. 
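For reference, a minimal usage sketch of the `lib/idb.ts` helpers introduced above. Only `saveLatestUpload`, `getLatestUpload`, and `deleteLatestUpload` come from the patch; the `restoreOrCache` wrapper and the relative import path are illustrative assumptions, and `getLatestUpload` rejects when nothing has been cached yet.

```ts
// Illustrative only: wiring the cached-upload helpers into a component or test.
import { saveLatestUpload, getLatestUpload, deleteLatestUpload } from "./lib/idb";

// Hypothetical helper: persist a fresh upload, or restore the last cached one.
async function restoreOrCache(file?: File): Promise<File | undefined> {
  if (file) {
    await saveLatestUpload(file, {
      name: file.name,
      size: file.size,
      type: file.type || "unknown",
      // Keep a small text preview, mirroring what CenterPanel stores.
      contentPreview: await file.slice(0, 4000).text(),
    });
    return file;
  }
  try {
    const { file: cached, meta } = await getLatestUpload();
    console.log(`Restored ${meta.name} (${meta.size} bytes) from IndexedDB`);
    return cached;
  } catch {
    // getLatestUpload() rejects with "No cached upload found" when the store is empty.
    return undefined;
  }
}

// Clearing the cached upload (e.g. from a "Remove file" button):
// await deleteLatestUpload();
```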
diff --git a/frontend/nordic-privacy-ai/package-lock.json b/frontend/package-lock.json similarity index 100% rename from frontend/nordic-privacy-ai/package-lock.json rename to frontend/package-lock.json diff --git a/frontend/nordic-privacy-ai/package.json b/frontend/package.json similarity index 100% rename from frontend/nordic-privacy-ai/package.json rename to frontend/package.json diff --git a/frontend/nordic-privacy-ai/postcss.config.js b/frontend/postcss.config.js similarity index 100% rename from frontend/nordic-privacy-ai/postcss.config.js rename to frontend/postcss.config.js diff --git a/frontend/nordic-privacy-ai/tailwind.config.ts b/frontend/tailwind.config.ts similarity index 100% rename from frontend/nordic-privacy-ai/tailwind.config.ts rename to frontend/tailwind.config.ts diff --git a/frontend/nordic-privacy-ai/tsconfig.json b/frontend/tsconfig.json similarity index 100% rename from frontend/nordic-privacy-ai/tsconfig.json rename to frontend/tsconfig.json diff --git a/requirements.txt b/requirements.txt index e69a34b..be42f1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,4 +9,15 @@ setuptools>=65.0.0 presidio-analyzer>=2.2.0 presidio-anonymizer>=2.2.0 spacy>=3.7.0 -# Download spaCy model with: python -m spacy download en_core_web_sm \ No newline at end of file +# Download spaCy model with: python -m spacy download en_core_web_sm + +# FastAPI Backend +fastapi>=0.109.0 +uvicorn[standard]>=0.27.0 +python-multipart>=0.0.6 + +# Optional: GPU Support (uncomment if you have CUDA) +# torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121 + +# Chatbot (WIP - not exposed in API yet) +gpt4all>=2.0.0 \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py deleted file mode 100644 index fa022e1..0000000 --- a/src/__init__.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -AI Governance Module - Bias Detection and Risk Analysis -""" - -from .data_processor import DataProcessor -from .model_trainer import GeneralizedModelTrainer -from .bias_analyzer import BiasAnalyzer -from .risk_analyzer import RiskAnalyzer -from .report_generator import ReportGenerator, NumpyEncoder - -import pandas as pd -import json - -__version__ = '1.0.0' - -__all__ = [ - 'DataProcessor', - 'GeneralizedModelTrainer', - 'BiasAnalyzer', - 'RiskAnalyzer', - 'ReportGenerator', - 'NumpyEncoder', - 'AIGovernanceAnalyzer' -] - - -class AIGovernanceAnalyzer: - """ - Main interface for AI Governance analysis - - Example: - >>> analyzer = AIGovernanceAnalyzer() - >>> report = analyzer.analyze('data.csv', 'target', ['gender', 'age']) - >>> print(f"Bias Score: {report['summary']['overall_bias_score']:.3f}") - """ - - def __init__(self): - """Initialize the analyzer""" - self.processor = None - self.trainer = None - self.bias_analyzer = None - self.risk_analyzer = None - self.report_generator = None - - def analyze(self, data_path, target_column, protected_attributes): - """ - Run complete AI governance analysis from file - - Args: - data_path (str): Path to CSV file - target_column (str): Name of target column - protected_attributes (list): List of protected attribute column names - - Returns: - dict: Complete analysis report - """ - df = pd.read_csv(data_path) - return self.analyze_dataframe(df, target_column, protected_attributes) - - def analyze_dataframe(self, df, target_column, protected_attributes): - """ - Run complete AI governance analysis from DataFrame - - Args: - df (pd.DataFrame): Input dataframe - target_column (str): Name of target column - protected_attributes (list): List of 
protected attribute column names - - Returns: - dict: Complete analysis report - """ - # Step 1: Process data - self.processor = DataProcessor(df) - self.processor.target_column = target_column - self.processor.protected_attributes = protected_attributes - self.processor.prepare_data() - - # Step 2: Train model - self.trainer = GeneralizedModelTrainer( - self.processor.X_train, - self.processor.X_test, - self.processor.y_train, - self.processor.y_test, - self.processor.feature_names - ) - self.trainer.train() - self.trainer.evaluate() - - # Step 3: Analyze bias - self.bias_analyzer = BiasAnalyzer( - self.processor.X_test, - self.processor.y_test, - self.trainer.y_pred, - self.processor.df, - self.processor.protected_attributes, - self.processor.target_column - ) - bias_results = self.bias_analyzer.analyze() - - # Step 4: Assess risks - self.risk_analyzer = RiskAnalyzer( - self.processor.df, - self.trainer.results, - bias_results, - self.processor.protected_attributes, - self.processor.target_column - ) - risk_results = self.risk_analyzer.analyze() - - # Step 5: Generate report - self.report_generator = ReportGenerator( - self.trainer.results, - bias_results, - risk_results, - self.processor.df - ) - - return self.report_generator.generate_report() - - def save_report(self, report, output_path): - """ - Save report to JSON file - - Args: - report (dict): Analysis report - output_path (str): Path to save JSON file - - Returns: - str: Path to saved file - """ - with open(output_path, 'w') as f: - json.dump(report, f, indent=2, cls=NumpyEncoder) - return output_path - - def get_summary(self, report): - """ - Get executive summary from report - - Args: - report (dict): Analysis report - - Returns: - dict: Summary metrics - """ - return report.get('summary', {}) diff --git a/src/report_generator.py b/src/report_generator.py deleted file mode 100644 index 3b41cbd..0000000 --- a/src/report_generator.py +++ /dev/null @@ -1,263 +0,0 @@ -""" -Report Generator Module -Generates comprehensive JSON reports -""" - -import json -import numpy as np -from datetime import datetime - -class NumpyEncoder(json.JSONEncoder): - """Custom JSON encoder for numpy types""" - def default(self, obj): - if isinstance(obj, (np.integer, np.int64, np.int32)): - return int(obj) - elif isinstance(obj, (np.floating, np.float64, np.float32)): - return float(obj) - elif isinstance(obj, (np.ndarray,)): - return obj.tolist() - elif isinstance(obj, (np.bool_,)): - return bool(obj) - return super(NumpyEncoder, self).default(obj) - -class ReportGenerator: - """Generate comprehensive analysis reports""" - - def __init__(self, model_results, bias_results, risk_results, df): - self.model_results = model_results - self.bias_results = bias_results - self.risk_results = risk_results - self.df = df - - def generate_report(self): - """Generate comprehensive JSON report""" - report = { - 'metadata': self._generate_metadata(), - 'summary': self._generate_summary(), - 'model_performance': self._format_model_results(), - 'bias_analysis': self._format_bias_results(), - 'risk_assessment': self._format_risk_results(), - 'key_findings': self._extract_key_findings(), - 'recommendations': self._compile_recommendations(), - 'detailed_metrics': self._compile_detailed_metrics() - } - - return report - - def _generate_metadata(self): - """Generate report metadata""" - return { - 'report_id': f"AIGov_{datetime.now().strftime('%Y%m%d_%H%M%S')}", - 'generated_at': datetime.now().isoformat(), - 'report_version': '1.0', - 'dataset_info': { - 'total_records': 
len(self.df), - 'total_features': len(self.df.columns), - 'columns': list(self.df.columns) - } - } - - def _generate_summary(self): - """Generate executive summary""" - model_metrics = self.model_results.get('metrics', {}) - - return { - 'overall_bias_score': self.bias_results.get('overall_bias_score', 0.0), - 'overall_risk_score': self.risk_results.get('overall_risk_score', 0.0), - 'risk_level': self.risk_results.get('risk_level', 'UNKNOWN'), - 'model_accuracy': model_metrics.get('accuracy', 0.0), - 'fairness_violations_count': len(self.bias_results.get('fairness_violations', [])), - 'passes_fairness_threshold': self.bias_results.get('fairness_assessment', {}).get('passes_fairness_threshold', False) - } - - def _format_model_results(self): - """Format model performance results""" - return { - 'model_type': self.model_results.get('model_type', 'Unknown'), - 'metrics': self.model_results.get('metrics', {}), - 'confusion_matrix': self.model_results.get('confusion_matrix', []), - 'top_features': dict(list(self.model_results.get('feature_importance', {}).items())[:10]) - } - - def _format_bias_results(self): - """Format bias analysis results""" - return { - 'overall_bias_score': self.bias_results.get('overall_bias_score', 0.0), - 'fairness_metrics': self.bias_results.get('fairness_metrics', {}), - 'fairness_violations': self.bias_results.get('fairness_violations', []), - 'fairness_assessment': self.bias_results.get('fairness_assessment', {}), - 'demographic_bias_summary': self._summarize_demographic_bias() - } - - def _format_risk_results(self): - """Format risk assessment results""" - return { - 'overall_risk_score': self.risk_results.get('overall_risk_score', 0.0), - 'risk_level': self.risk_results.get('risk_level', 'UNKNOWN'), - 'risk_categories': self.risk_results.get('risk_categories', {}), - 'privacy_risks': self._summarize_privacy_risks(), - 'ethical_risks': self._summarize_ethical_risks() - } - - def _summarize_demographic_bias(self): - """Summarize demographic bias""" - demo_bias = self.bias_results.get('demographic_bias', {}) - summary = {} - - for attr, data in demo_bias.items(): - summary[attr] = { - 'max_disparity': data.get('max_disparity', 0), - 'groups_analyzed': len(data.get('approval_rates', {})) - } - - return summary - - def _summarize_privacy_risks(self): - """Summarize privacy risks""" - privacy = self.risk_results.get('privacy_risks', {}) - - return { - 'pii_count': len(privacy.get('pii_detected', [])), - 'anonymization_level': privacy.get('anonymization_level', 'UNKNOWN'), - 'exposure_risk_count': len(privacy.get('exposure_risks', [])), - 'gdpr_compliance_score': privacy.get('gdpr_compliance', {}).get('compliance_score', 0) - } - - def _summarize_ethical_risks(self): - """Summarize ethical risks""" - ethical = self.risk_results.get('ethical_risks', {}) - - return { - 'fairness_issues_count': len(ethical.get('fairness_issues', [])), - 'transparency_score': ethical.get('transparency_score', 0), - 'bias_amplification_risk': ethical.get('bias_amplification_risk', 'UNKNOWN'), - 'social_impact': ethical.get('social_impact_assessment', {}) - } - - def _extract_key_findings(self): - """Extract key findings from analysis""" - findings = [] - - # Model performance findings - accuracy = self.model_results.get('metrics', {}).get('accuracy', 0) - if accuracy >= 0.8: - findings.append(f"βœ“ Model achieves good accuracy ({accuracy:.2%})") - else: - findings.append(f"⚠ Model accuracy is below optimal ({accuracy:.2%})") - - # Bias findings - bias_score = 
self.bias_results.get('overall_bias_score', 0) - if bias_score < 0.3: - findings.append("βœ“ Low bias detected across protected attributes") - elif bias_score < 0.5: - findings.append("⚠ Moderate bias detected - monitoring recommended") - else: - findings.append("❌ High bias detected - immediate action required") - - # Fairness violations - violations = self.bias_results.get('fairness_violations', []) - if violations: - high_sev = sum(1 for v in violations if v['severity'] == 'HIGH') - findings.append(f"❌ {len(violations)} fairness violations detected ({high_sev} high severity)") - else: - findings.append("βœ“ No fairness violations detected") - - # Privacy findings - privacy = self.risk_results.get('privacy_risks', {}) - pii_count = len(privacy.get('pii_detected', [])) - if pii_count > 0: - findings.append(f"⚠ {pii_count} columns contain potential PII") - else: - findings.append("βœ“ No obvious PII detected in dataset") - - # Risk level - risk_level = self.risk_results.get('risk_level', 'UNKNOWN') - findings.append(f"Overall Risk Level: {risk_level}") - - return findings - - def _compile_recommendations(self): - """Compile all recommendations""" - recommendations = [] - - # Get recommendations from each component - privacy_recs = self.risk_results.get('privacy_risks', {}).get('recommendations', []) - ethical_recs = self.risk_results.get('ethical_risks', {}).get('recommendations', []) - performance_recs = self.risk_results.get('model_performance_risks', {}).get('recommendations', []) - compliance_recs = self.risk_results.get('compliance_risks', {}).get('recommendations', []) - - # Prioritize recommendations - all_recs = [] - - # High priority (from violations and high risks) - violations = self.bias_results.get('fairness_violations', []) - if violations: - all_recs.append({ - 'priority': 'HIGH', - 'category': 'Fairness', - 'recommendation': 'Address fairness violations in protected attributes' - }) - - if len(privacy_recs) > 0: - all_recs.append({ - 'priority': 'HIGH', - 'category': 'Privacy', - 'recommendation': privacy_recs[0] - }) - - # Medium priority - for rec in ethical_recs[:2]: - all_recs.append({ - 'priority': 'MEDIUM', - 'category': 'Ethics', - 'recommendation': rec - }) - - # Lower priority - for rec in performance_recs[:2]: - all_recs.append({ - 'priority': 'MEDIUM', - 'category': 'Performance', - 'recommendation': rec - }) - - for rec in compliance_recs[:2]: - all_recs.append({ - 'priority': 'MEDIUM', - 'category': 'Compliance', - 'recommendation': rec - }) - - # Convert to simple list with formatting - recommendations = [ - f"[{r['priority']}] {r['category']}: {r['recommendation']}" - for r in all_recs[:10] # Limit to top 10 - ] - - return recommendations - - def _compile_detailed_metrics(self): - """Compile detailed metrics for analysis""" - return { - 'bias_metrics': { - 'by_attribute': self.bias_results.get('fairness_metrics', {}), - 'demographic_analysis': self.bias_results.get('demographic_bias', {}) - }, - 'risk_breakdown': { - 'privacy': self.risk_results.get('privacy_risks', {}), - 'ethical': self.risk_results.get('ethical_risks', {}), - 'compliance': self.risk_results.get('compliance_risks', {}), - 'data_quality': self.risk_results.get('data_quality_risks', {}) - }, - 'model_details': { - 'classification_report': self.model_results.get('classification_report', {}), - 'feature_importance': self.model_results.get('feature_importance', {}) - } - } - - def save_report(self, filepath): - """Save report to JSON file""" - report = self.generate_report() - with 
open(filepath, 'w') as f: - json.dump(report, f, indent=2, cls=NumpyEncoder) - return filepath diff --git a/start_api.py b/start_api.py new file mode 100644 index 0000000..754bcc8 --- /dev/null +++ b/start_api.py @@ -0,0 +1,21 @@ +""" +Start the FastAPI server +Run: python start_api.py +""" + +import uvicorn + +if __name__ == "__main__": + print("πŸš€ Starting Nordic Privacy AI API Server...") + print("πŸ“ API will be available at: http://localhost:8000") + print("πŸ“– Interactive docs at: http://localhost:8000/docs") + print("πŸ”— Frontend should run at: http://localhost:3000") + print("\nPress CTRL+C to stop\n") + + uvicorn.run( + "api.main:app", + host="0.0.0.0", + port=8000, + reload=True, # Auto-reload on code changes + log_level="info" + ) diff --git a/test_cleaning.py b/test_cleaning.py index bcdfb91..65c9968 100644 --- a/test_cleaning.py +++ b/test_cleaning.py @@ -10,7 +10,7 @@ import os # Add parent directory to path sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from cleaning import DataCleaner, CleaningConfig +from data_cleaning import DataCleaner def test_basic_cleaning():