diff --git a/.gitignore b/.gitignore
index 44cf1b7..972f0c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,9 +48,11 @@ Thumbs.db
 # Streamlit
 .streamlit/secrets.toml
 
-# Reports
+# Reports and generated files
 reports/*.json
 reports/*.pdf
+reports/*.csv
+reports/*.html
 
 # Logs
 *.log
@@ -59,4 +61,17 @@ reports/*.pdf
 *.csv
 !Datasets/loan_data.csv
 
+# Node.js & Next.js
+node_modules/
+.next/
+out/
+.vercel
+*.tsbuildinfo
+next-env.d.ts
+
+# Frontend build artifacts
+frontend/nordic-privacy-ai/.next/
+frontend/nordic-privacy-ai/out/
+frontend/nordic-privacy-ai/node_modules/
+
 Data
\ No newline at end of file
diff --git a/README.md b/README.md
index 8b4af0a..701a09c 100644
--- a/README.md
+++ b/README.md
@@ -1,39 +1,1029 @@
-# AI Governance Module
-
-A Python package for detecting bias and analyzing risks in machine learning models. Provides comprehensive fairness metrics, privacy risk assessment, and ethical AI evaluation.
-
-## Features
-
-### 🎯 Bias Detection
-- **Fairness Metrics**: Disparate Impact, Statistical Parity Difference, Equal Opportunity Difference
-- **Demographic Analysis**: Group-wise performance evaluation
-- **Violation Detection**: Automatic flagging with severity levels
-
-### 🛡️ Risk Assessment
-- **Privacy Risks**: PII detection, GDPR compliance, data exposure analysis
-- **Ethical Risks**: Fairness, transparency, accountability, social impact
-- **Compliance Risks**: Regulatory adherence (GDPR, CCPA, AI Act)
-- **Data Quality**: Missing data, class imbalance, outlier detection
-
-### 🤖 Machine Learning
-- Generalized classification model (works with any dataset)
-- Auto-detection of feature types and protected attributes
-- Comprehensive performance metrics
-- Feature importance analysis
-
-## Installation
-
-```bash
-pip install -r requirements.txt
-```
-
-Or install as a package:
-
-```bash
-pip install -e .
-```
-
-## Quick Start
-
+# Nordic Privacy AI 🛡️
+
+**AI-Powered GDPR Compliance & Privacy Protection Platform**
+
+A comprehensive solution for AI governance, bias detection, risk assessment, and automated PII cleaning with GDPR compliance. Built for Nordic ecosystems and beyond.
+
+[](https://www.python.org/)
+[](https://fastapi.tiangolo.com/)
+[](https://nextjs.org/)
+[](LICENSE)
+
+---
+
+## 🚀 Quick Start
+
+### Prerequisites
+
+- Python 3.8+
+- Node.js 18+
+- GPU (optional, for faster processing)
+
+### Installation
+
+1. **Clone the repository**
+```powershell
+git clone https://github.com/PlatypusPus/MushroomEmpire.git
+cd MushroomEmpire
+```
+
+2. **Install Python dependencies**
+```powershell
+pip install -r requirements.txt
+python -m spacy download en_core_web_sm
+```
+
+3. **Install frontend dependencies**
+```powershell
+cd frontend
+npm install
+cd ..
+```
+
+### Running the Application
+
+1. **Start the FastAPI backend** (Terminal 1)
+```powershell
+python start_api.py
+```
+Backend runs at: **http://localhost:8000**
+
+2. **Start the Next.js frontend** (Terminal 2)
+```powershell
+cd frontend
+npm run dev
+```
+Frontend runs at: **http://localhost:3000**
+
+3. **Access the application**
+   - Frontend UI: http://localhost:3000
+   - Try It Page: http://localhost:3000/try
+   - API Documentation: http://localhost:8000/docs
+   - Health Check: http://localhost:8000/health
+
+---
+
+## 🌟 Features
+
+### 🎯 AI Governance & Bias Detection
+- **Fairness Metrics**: Disparate Impact, Statistical Parity, Equal Opportunity (sketched below)
+- **Demographic Analysis**: Group-wise performance evaluation
+- **Violation Detection**: Automatic flagging with severity levels (HIGH/MEDIUM/LOW)
+- **Model Performance**: Comprehensive ML metrics (accuracy, precision, recall, F1)
+
+### 🛡️ Privacy Risk Assessment
+- **Privacy Risks**: PII detection, GDPR compliance scoring, data exposure analysis
+- **Ethical Risks**: Fairness, transparency, accountability evaluation
+- **Compliance Risks**: Regulatory adherence (GDPR, CCPA, AI Act)
+- **Data Quality**: Missing data, class imbalance, outlier detection
+
+### 🧹 Automated Data Cleaning
+- **PII Detection**: Email, phone, SSN, credit cards, IP addresses, and more
+- **GPU Acceleration**: CUDA-enabled for 10x faster processing
+- **GDPR Compliance**: Automatic anonymization with audit trails
+- **Smart Anonymization**: Context-aware masking and pseudonymization
+
+### 🌐 Modern Web Interface
+- **Drag & Drop Upload**: Intuitive CSV file handling
+- **Real-time Processing**: Live feedback and progress tracking
+- **Interactive Dashboards**: Visualize bias metrics, risk scores, and results
+- **Report Downloads**: JSON reports, cleaned CSV, and audit logs
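+
+For intuition, the first two fairness metrics reduce to a few lines of pandas. The sketch below shows the standard definitions; it is illustrative only and not necessarily the exact implementation in `bias_analyzer.py`:
+
+```python
+import pandas as pd
+
+def disparate_impact(df: pd.DataFrame, group_col: str, outcome_col: str,
+                     privileged, unprivileged) -> float:
+    """Ratio of favorable-outcome rates (unprivileged / privileged); ~0.8-1.25 is fair."""
+    rate_priv = df.loc[df[group_col] == privileged, outcome_col].mean()
+    rate_unpriv = df.loc[df[group_col] == unprivileged, outcome_col].mean()
+    return rate_unpriv / rate_priv
+
+def statistical_parity_difference(df: pd.DataFrame, group_col: str, outcome_col: str,
+                                  privileged, unprivileged) -> float:
+    """Difference of favorable-outcome rates; |value| < 0.1 is commonly treated as fair."""
+    return (df.loc[df[group_col] == unprivileged, outcome_col].mean()
+            - df.loc[df[group_col] == privileged, outcome_col].mean())
+```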
+
+---
+
+## 🏗️ Project Structure
+
+```
+MushroomEmpire/
+├── api/                          # FastAPI Backend
+│   ├── main.py                   # Application entry point
+│   ├── routers/
+│   │   ├── analyze.py            # POST /api/analyze - AI Governance
+│   │   └── clean.py              # POST /api/clean - Data Cleaning
+│   └── utils/                    # Helper utilities
+│
+├── ai_governance/                # Core AI Governance Module
+│   ├── __init__.py               # AIGovernanceAnalyzer class
+│   ├── data_processor.py         # Data preprocessing
+│   ├── model_trainer.py          # ML model training
+│   ├── bias_analyzer.py          # Bias detection engine
+│   ├── risk_analyzer.py          # Risk assessment engine
+│   └── report_generator.py       # JSON report generation
+│
+├── data_cleaning/                # Data Cleaning Module
+│   ├── __init__.py               # DataCleaner class
+│   ├── cleaner.py                # PII detection & anonymization
+│   └── config.py                 # PII patterns & GDPR rules
+│
+├── frontend/                     # Next.js Frontend
+│   ├── app/                      # App Router pages
+│   │   ├── page.tsx              # Landing page
+│   │   └── try/page.tsx          # Try it page (workflow UI)
+│   ├── components/
+│   │   └── try/
+│   │       ├── CenterPanel.tsx   # File upload & results
+│   │       ├── Sidebar.tsx       # Workflow tabs
+│   │       └── ChatbotPanel.tsx  # AI assistant
+│   └── lib/
+│       ├── api.ts                # TypeScript API client
+│       └── indexeddb.ts          # Browser caching utilities
+│
+├── Datasets/                     # Sample datasets
+│   └── loan_data.csv             # Example: Loan approval dataset
+│
+├── reports/                      # Generated reports (auto-created)
+│   ├── governance_report_*.json
+│   ├── cleaned_*.csv
+│   └── cleaning_audit_*.json
+│
+├── test_cleaning.py              # Unit tests for cleaning module
+├── start_api.py                  # Backend startup script
+├── setup.py                      # Package configuration
+├── requirements.txt              # Python dependencies
+└── README.md                     # This file
+```
"email"], + + "columns_anonymized": ["phone", "address"],β βββ report_generator.py # JSON report generationprocessor.target_column = 'target' + + "total_cells_affected": 2847 + + },βprocessor.protected_attributes = ['gender', 'age'] + + "pii_detections": { + + "EMAIL": 1000,βββ cleaning.py # Core PII detection & anonymizationprocessor.prepare_data() + + "PHONE": 987, + + "SSN": 1000βββ cleaning_config.py # Configuration for data cleaning + + }, + + "gdpr_compliance": [βββ test_cleaning.py # Unit tests for cleaning module# Train model + + "Article 5(1)(c) - Data minimization", + + "Article 17 - Right to erasure",βtrainer = GeneralizedModelTrainer( + + "Article 25 - Data protection by design" + + ],βββ frontend/nordic-privacy-ai/ # Next.js Frontend processor.X_train, + + "files": { + + "cleaned_csv": "/reports/cleaned_20251107_123456.csv",β βββ app/ # App Router pages processor.X_test, + + "audit_report": "/reports/cleaning_audit_20251107_123456.json" + + }β β βββ page.tsx # Landing page processor.y_train, + +} + +```β β βββ try/page.tsx # Try it page (workflow UI) processor.y_test, + + + +#### **GET /health**β βββ components/ processor.feature_names + +Health check endpoint with GPU status. + +β β βββ try/) + +**Response:** + +```jsonβ β βββ CenterPanel.tsx # File upload & resultstrainer.train() + +{ + + "status": "healthy",β β βββ Sidebar.tsx # Workflow tabstrainer.evaluate() + + "version": "1.0.0", + + "gpu_available": trueβ β βββ ChatbotPanel.tsx # AI assistant + +} + +```β βββ lib/# Analyze bias + + + +#### **GET /reports/{filename}**β βββ api.ts # TypeScript API clientbias_analyzer = BiasAnalyzer( + +Download generated reports and cleaned files. + +β βββ indexeddb.ts # Browser caching utilities processor.X_test, + +--- + +β processor.y_test, + +## π§ Configuration + +βββ Datasets/ # Sample datasets trainer.y_pred, + +### Environment Variables + +β βββ loan_data.csv # Example: Loan approval dataset processor.df, + +Create `.env` file in `frontend/`: + +```envβ processor.protected_attributes, + +NEXT_PUBLIC_API_URL=http://localhost:8000 + +```βββ reports/ # Generated reports (auto-created) processor.target_column + + + +### CORS Configurationβ βββ governance_report_*.json) + + + +Edit `api/main.py` to add production domains:β βββ cleaned_*.csvbias_results = bias_analyzer.analyze() + +```python + +origins = [β βββ cleaning_audit_*.json + + "http://localhost:3000", + + "https://your-production-domain.com"β# Assess risks + +] + +```βββ start_api.py # Backend startup scriptrisk_analyzer = RiskAnalyzer( + + + +### GPU Accelerationβββ setup.py # Package configuration processor.df, + + + +GPU is automatically detected and used if available. 
+
+---
+
+## 🔧 Configuration
+
+### Environment Variables
+
+Create a `.env` file in `frontend/`:
+```env
+NEXT_PUBLIC_API_URL=http://localhost:8000
+```
+
+### CORS Configuration
+
+Edit `api/main.py` to add production domains:
+```python
+origins = [
+    "http://localhost:3000",
+    "https://your-production-domain.com"
+]
+```
+
+### GPU Acceleration
+
+GPU is automatically detected and used if available. To force CPU mode:
+```python
+# In data_cleaning/cleaner.py or api endpoints
+DataCleaner(use_gpu=False)
+```
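+
+To decide at runtime, detection can follow the usual PyTorch pattern (the same check `api/main.py` uses for its health endpoint); a sketch:
+
+```python
+from data_cleaning import DataCleaner
+
+try:
+    import torch
+    use_gpu = torch.cuda.is_available()
+except Exception:
+    use_gpu = False  # PyTorch not installed; fall back to CPU
+
+cleaner = DataCleaner(use_gpu=use_gpu)
+```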
+
+---
+
+## 🧪 Testing
+
+### Test the Backend
+```powershell
+# Test analyze endpoint
+curl -X POST "http://localhost:8000/api/analyze" -F "file=@Datasets/loan_data.csv"
+
+# Test clean endpoint
+curl -X POST "http://localhost:8000/api/clean" -F "file=@Datasets/loan_data.csv"
+
+# Check health
+curl http://localhost:8000/health
+```
+
+### Run Unit Tests
+```powershell
+# Test cleaning module
+python test_cleaning.py
+
+# Run all tests (if pytest configured)
+pytest
+```
+
+---
+
+## 📝 Usage Examples
+
+### Python SDK Usage
+
 ```python
 from ai_governance import AIGovernanceAnalyzer
@@ -41,221 +1031,143 @@ from ai_governance import AIGovernanceAnalyzer
 
 # Initialize analyzer
 analyzer = AIGovernanceAnalyzer()
 
-# Run complete analysis
+# Analyze dataset
 report = analyzer.analyze(
-    data_path='your_data.csv',
-    target_column='target',
+    data_path='Datasets/loan_data.csv',
+    target_column='loan_approved',
     protected_attributes=['gender', 'age', 'race']
 )
 
-# Access results
+# Print results
 print(f"Bias Score: {report['summary']['overall_bias_score']:.3f}")
 print(f"Risk Level: {report['summary']['risk_level']}")
 print(f"Model Accuracy: {report['summary']['model_accuracy']:.3f}")
 
 # Save report
-analyzer.save_report(report, 'governance_report.json')
+analyzer.save_report(report, 'my_report.json')
 ```
 
-## Module Structure
-
-```
-ai_governance/
-├── __init__.py          # Main API
-├── data_processor.py    # Data preprocessing
-├── model_trainer.py     # ML model training
-├── bias_analyzer.py     # Bias detection
-├── risk_analyzer.py     # Risk assessment
-└── report_generator.py  # Report generation
-```
-
-## API Reference
-
-### AIGovernanceAnalyzer
-
-Main class for running AI governance analysis.
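+
+If the data is already in memory, the analyzer also accepts a DataFrame directly; this is the entry point the FastAPI router in `api/routers/analyze.py` uses:
+
+```python
+import pandas as pd
+from ai_governance import AIGovernanceAnalyzer
+
+df = pd.read_csv('Datasets/loan_data.csv')
+analyzer = AIGovernanceAnalyzer()
+report = analyzer.analyze_dataframe(df, 'loan_approved', ['gender', 'age', 'race'])
+```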
+### Data Cleaning Usage
 
 ```python
-analyzer = AIGovernanceAnalyzer()
+from data_cleaning import DataCleaner
 
-# Analyze from DataFrame
-report = analyzer.analyze_dataframe(
-    df=dataframe,
-    target_column='target',
-    protected_attributes=['gender', 'age']
-)
+# Initialize cleaner with GPU
+cleaner = DataCleaner(use_gpu=True)
 
-# Analyze from file
-report = analyzer.analyze(
-    data_path='data.csv',
-    target_column='target',
-    protected_attributes=['gender', 'age']
-)
+# Load and clean data
+df = cleaner.load_data('Datasets/loan_data.csv')
+cleaned_df, audit = cleaner.anonymize_pii(df)
+
+# Save results
+cleaner.save_cleaned_data(cleaned_df, 'cleaned_output.csv')
+cleaner.save_audit_report(audit, 'audit_report.json')
 ```
 
-### Individual Components
+### Frontend Integration
 
-```python
-from ai_governance import (
-    DataProcessor,
-    GeneralizedModelTrainer,
-    BiasAnalyzer,
-    RiskAnalyzer,
-    ReportGenerator
-)
+```typescript
+import { analyzeDataset, cleanDataset } from '@/lib/api';
 
-# Process data
-processor = DataProcessor(df)
-processor.target_column = 'target'
-processor.protected_attributes = ['gender', 'age']
-processor.prepare_data()
+// Analyze uploaded file
+const handleAnalyze = async (file: File) => {
+  const result = await analyzeDataset(file);
+  console.log('Bias Score:', result.bias_metrics.overall_bias_score);
+  console.log('Download:', result.report_file);
+};
 
-# Train model
-trainer = GeneralizedModelTrainer(
-    processor.X_train,
-    processor.X_test,
-    processor.y_train,
-    processor.y_test,
-    processor.feature_names
-)
-trainer.train()
-trainer.evaluate()
-
-# Analyze bias
-bias_analyzer = BiasAnalyzer(
-    processor.X_test,
-    processor.y_test,
-    trainer.y_pred,
-    processor.df,
-    processor.protected_attributes,
-    processor.target_column
-)
-bias_results = bias_analyzer.analyze()
-
-# Assess risks
-risk_analyzer = RiskAnalyzer(
-    processor.df,
-    trainer.results,
-    bias_results,
-    processor.protected_attributes,
-    processor.target_column
-)
-risk_results = risk_analyzer.analyze()
-
-# Generate report
-report_gen = ReportGenerator(
-    trainer.results,
-    bias_results,
-    risk_results,
-    processor.df
-)
-report = report_gen.generate_report()
+// Clean uploaded file
+const handleClean = async (file: File) => {
+  const result = await cleanDataset(file);
+  console.log('Cells anonymized:', result.summary.total_cells_affected);
+  console.log('Download cleaned:', result.files.cleaned_csv);
+};
 ```
 
-## Report Structure
+---
 
-The module generates comprehensive JSON reports:
-
-```json
-{
-  "metadata": {
-    "report_id": "unique_id",
-    "generated_at": "timestamp",
-    "dataset_info": {}
-  },
-  "summary": {
-    "overall_bias_score": 0.0-1.0,
-    "overall_risk_score": 0.0-1.0,
-    "risk_level": "LOW|MEDIUM|HIGH",
-    "model_accuracy": 0.0-1.0,
-    "fairness_violations_count": 0
-  },
-  "model_performance": {},
-  "bias_analysis": {},
-  "risk_assessment": {},
-  "key_findings": [],
-  "recommendations": []
-}
-```
-
-## Metrics Interpretation
+## 📊 Metrics Interpretation
 
 ### Bias Score (0-1, lower is better)
-- **0.0 - 0.3**: Low bias ✅
-- **0.3 - 0.5**: Moderate bias ⚠️
-- **0.5 - 1.0**: High bias ❌
+- **0.0 - 0.3**: ✅ Low bias - Good fairness
+- **0.3 - 0.5**: ⚠️ Moderate bias - Monitoring recommended
+- **0.5 - 1.0**: ❌ High bias - Immediate action required
 
 ### Risk Score (0-1, lower is better)
-- **0.0 - 0.4**: LOW risk ✅
-- **0.4 - 0.7**: MEDIUM risk ⚠️
-- **0.7 - 1.0**: HIGH risk ❌
+- **0.0 - 0.4**: ✅ LOW risk
+- **0.4 - 0.7**: ⚠️ MEDIUM risk
+- **0.7 - 1.0**: ❌ HIGH risk
 
 ### Fairness Metrics
 - **Disparate Impact**: Fair range 0.8 - 1.25
 - **Statistical Parity**: Fair threshold < 0.1
 - **Equal Opportunity**: Fair threshold < 0.1
 
-## Requirements
+---
 
-- Python 3.8+
-- pandas >= 2.0.0
-- numpy >= 1.24.0
-- scikit-learn >= 1.3.0
+## 🛠️ Technology Stack
 
-See `requirements.txt` for complete list.
+### Backend
+- **FastAPI** - Modern Python web framework
+- **scikit-learn** - Machine learning
+- **spaCy** - NLP for PII detection
+- **PyTorch** - GPU acceleration (optional)
+- **pandas** - Data processing
 
-## Integration Examples
+### Frontend
+- **Next.js 14** - React framework with App Router
+- **TypeScript** - Type safety
+- **Tailwind CSS** - Styling
+- **IndexedDB** - Browser storage
 
-### FastAPI Backend
+---
 
-```python
-from fastapi import FastAPI, UploadFile
-from ai_governance import AIGovernanceAnalyzer
+## 🤝 Contributing
 
-app = FastAPI()
-analyzer = AIGovernanceAnalyzer()
+Contributions are welcome! Please follow these steps:
 
-@app.post("/analyze")
-async def analyze(file: UploadFile, target: str, protected: list):
-    df = pd.read_csv(file.file)
-    report = analyzer.analyze_dataframe(df, target, protected)
-    return report
+1. Fork the repository
+2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+---
+
+## 📄 License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+---
+
+## 📚 Citation
+
+If you use this project in your research or work, please cite:
+
+```bibtex
+@software{nordic_privacy_ai,
+  title = {Nordic Privacy AI - GDPR Compliance & AI Governance Platform},
+  author = {PlatypusPus},
+  year = {2025},
+  url = {https://github.com/PlatypusPus/MushroomEmpire}
+}
 ```
 
-### Flask Backend
+---
 
-```python
-from flask import Flask, request, jsonify
-from ai_governance import AIGovernanceAnalyzer
+## 📧 Support
 
-app = Flask(__name__)
-analyzer = AIGovernanceAnalyzer()
+- **Issues**: [GitHub Issues](https://github.com/PlatypusPus/MushroomEmpire/issues)
+- **Discussions**: [GitHub Discussions](https://github.com/PlatypusPus/MushroomEmpire/discussions)
 
-@app.route('/analyze', methods=['POST'])
-def analyze():
-    file = request.files['file']
-    df = pd.read_csv(file)
-    report = analyzer.analyze_dataframe(
-        df,
-        request.form['target'],
-        request.form.getlist('protected')
-    )
-    return jsonify(report)
-```
+---
 
-## License
+## 🙏 Acknowledgments
 
-MIT License
+- Built for Nordic ecosystems (BankID, MitID, Suomi.fi)
+- Inspired by GDPR, CCPA, and EU AI Act requirements
+- Developed as a hackathon prototype
 
-## Contributing
+---
 
-Contributions welcome! Please open an issue or submit a pull request.
-
-## Citation
-
-If you use this module in your research or project, please cite:
-
-```
-AI Governance Module - Bias Detection and Risk Analysis
-https://github.com/PlatypusPus/MushroomEmpire
-```
+
+**Made with ❤️ by the Nordic Privacy AI Team**
diff --git a/ai_governance/data_processor.py b/ai_governance/data_processor.py
index 01c70bb..e7004df 100644
--- a/ai_governance/data_processor.py
+++ b/ai_governance/data_processor.py
@@ -21,6 +21,7 @@ class DataProcessor:
         self.categorical_features = []
         self.feature_names = []
         self.encoders = {}
+        self.target_encoder = None  # Add target encoder
         self.scaler = StandardScaler()
 
         self.X_train = None
@@ -75,6 +76,13 @@ class DataProcessor:
         X = self.df[feature_cols].copy()
         y = self.df[self.target_column].copy()
 
+        # Encode target variable if it's categorical
+        if y.dtype == 'object' or y.dtype.name == 'category':
+            self.target_encoder = LabelEncoder()
+            y_encoded = self.target_encoder.fit_transform(y)
+            y = pd.Series(y_encoded, index=y.index)
+            print(f"Target '{self.target_column}' encoded: {dict(enumerate(self.target_encoder.classes_))}")
+
         # Encode categorical variables
         for col in self.categorical_features:
             if col in X.columns:
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..f88a663
--- /dev/null
+++ b/api/__init__.py
@@ -0,0 +1 @@
+# Empty __init__.py files for Python package structure
diff --git a/api/main.py b/api/main.py
new file mode 100644
index 0000000..7904c89
--- /dev/null
+++ b/api/main.py
@@ -0,0 +1,72 @@
+"""
+FastAPI Backend for Nordic Privacy AI
+Provides endpoints for AI Governance analysis and data cleaning
+"""
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+import os
+
+from api.routers import analyze, clean
+
+# Create FastAPI app
+app = FastAPI(
+    title="Nordic Privacy AI API",
+    description="AI-powered GDPR compliance, bias detection, and risk analysis",
+    version="1.0.0"
+)
+
+# CORS configuration for Next.js frontend
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "http://localhost:3000",  # Next.js dev server
+        "http://127.0.0.1:3000",
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Mount reports directory for file downloads
+reports_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "reports")
+os.makedirs(reports_dir, exist_ok=True)
+app.mount("/reports", StaticFiles(directory=reports_dir), name="reports")
+
+# Include routers
+app.include_router(analyze.router, prefix="/api", tags=["AI Governance"])
+app.include_router(clean.router, prefix="/api", tags=["Data Cleaning"])
+
+@app.get("/")
+async def root():
+    """Root endpoint with service information"""
+    return {
+        "status": "online",
+        "service": "Nordic Privacy AI API",
+        "version": "1.0.0",
+        "endpoints": {
+            "analyze": "/api/analyze",
+            "clean": "/api/clean",
+            "docs": "/docs"
+        }
+    }
+
+@app.get("/health")
+async def health_check():
+    """Detailed health check"""
+    try:
+        import torch
+        cuda_available = torch.cuda.is_available()
+        gpu_name = torch.cuda.get_device_name(0) if cuda_available else None
+    except Exception:  # torch not installed or CUDA query failed
+        cuda_available = False
+        gpu_name = None
+
+    return {
+        "status": "healthy",
+        "gpu_acceleration": {
+            "available": cuda_available,
+            "device": gpu_name or "CPU"
+        }
+    }
diff --git a/api/routers/__init__.py b/api/routers/__init__.py
new file mode 100644
index 0000000..f88a663
--- /dev/null
+++ b/api/routers/__init__.py
@@ -0,0 +1 @@
+# Empty __init__.py files for Python package structure
diff --git 
a/api/routers/analyze.py b/api/routers/analyze.py new file mode 100644 index 0000000..f87a5e6 --- /dev/null +++ b/api/routers/analyze.py @@ -0,0 +1,141 @@ +""" +AI Governance Analysis Router +Handles bias detection and risk analysis endpoints +""" + +from fastapi import APIRouter, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +import pandas as pd +import numpy as np +import io +import os +import json +from datetime import datetime +from typing import Dict, Any + +# Import AI Governance modules +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +from ai_governance import AIGovernanceAnalyzer + +router = APIRouter() + + +def convert_to_serializable(obj): + """Convert numpy/pandas types to native Python types for JSON serialization""" + if isinstance(obj, (np.integer, np.int64, np.int32)): + return int(obj) + elif isinstance(obj, (np.floating, np.float64, np.float32)): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {key: convert_to_serializable(value) for key, value in obj.items()} + elif isinstance(obj, list): + return [convert_to_serializable(item) for item in obj] + return obj + +@router.post("/analyze") +async def analyze_dataset(file: UploadFile = File(...)): + """ + Analyze uploaded dataset for bias and risk + + - **file**: CSV file to analyze + + Returns: + - Analysis results (bias metrics, risk assessment) + - Report file path for download + """ + + # Validate file type + if not file.filename.endswith('.csv'): + raise HTTPException(status_code=400, detail="Only CSV files are supported") + + try: + # Read uploaded file + contents = await file.read() + df = pd.read_csv(io.BytesIO(contents)) + + if df.empty: + raise HTTPException(status_code=400, detail="Uploaded file is empty") + + # Initialize AI Governance Analyzer + analyzer = AIGovernanceAnalyzer() + + # Auto-detect target column and protected attributes + # Target: Last column (common convention) or first binary/categorical column + target_column = df.columns[-1] + + # Protected attributes: Common sensitive columns + protected_keywords = ['gender', 'age', 'race', 'sex', 'ethnicity', 'religion', 'nationality'] + protected_attributes = [col for col in df.columns + if any(keyword in col.lower() for keyword in protected_keywords)] + + # If no protected attributes found, use first few categorical columns + if not protected_attributes: + categorical_cols = df.select_dtypes(include=['object', 'category']).columns + protected_attributes = [col for col in categorical_cols if col != target_column][:3] + + print(f"Analyzing dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)") + print(f"Target column: {target_column}") + print(f"Protected attributes: {protected_attributes}") + + # Run analysis + report = analyzer.analyze_dataframe(df, target_column, protected_attributes) + + # Generate report filename + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_filename = (file.filename or "dataset").replace('.csv', '') + report_filename = f"governance_report_{safe_filename}_{timestamp}.json" + report_path = os.path.join("reports", report_filename) + + # Save full report to disk + full_report_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + report_path + ) + analyzer.save_report(report, full_report_path) + + # Prepare response with summary + response_data = { + "status": "success", + "filename": file.filename, + "dataset_info": { + "rows": len(df), + 
"columns": len(df.columns), + "features": list(df.columns) + }, + "model_performance": { + "accuracy": report.get("model_metrics", {}).get("accuracy", 0), + "precision": report.get("model_metrics", {}).get("precision", 0), + "recall": report.get("model_metrics", {}).get("recall", 0), + "f1_score": report.get("model_metrics", {}).get("f1_score", 0) + }, + "bias_metrics": { + "overall_bias_score": report.get("bias_metrics", {}).get("overall_bias_score", 0), + "disparate_impact": report.get("bias_metrics", {}).get("disparate_impact", {}), + "statistical_parity": report.get("bias_metrics", {}).get("statistical_parity_difference", {}), + "violations_detected": report.get("bias_metrics", {}).get("fairness_violations", []) + }, + "risk_assessment": { + "overall_risk_score": report.get("risk_metrics", {}).get("overall_risk_score", 0), + "privacy_risks": report.get("risk_metrics", {}).get("privacy_risks", []), + "ethical_risks": report.get("risk_metrics", {}).get("ethical_risks", []), + "compliance_risks": report.get("risk_metrics", {}).get("compliance_risks", []), + "data_quality_risks": report.get("risk_metrics", {}).get("data_quality_risks", []) + }, + "recommendations": report.get("recommendations", []), + "report_file": f"/{report_path}", + "timestamp": datetime.now().isoformat() + } + + # Convert all numpy/pandas types to native Python types + response_data = convert_to_serializable(response_data) + + return JSONResponse(content=response_data) + + except pd.errors.EmptyDataError: + raise HTTPException(status_code=400, detail="File is empty or invalid CSV format") + except Exception as e: + print(f"Error during analysis: {str(e)}") + raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}") diff --git a/api/routers/clean.py b/api/routers/clean.py new file mode 100644 index 0000000..0a5eb68 --- /dev/null +++ b/api/routers/clean.py @@ -0,0 +1,142 @@ +""" +Data Cleaning Router +Handles PII detection and anonymization endpoints +""" + +from fastapi import APIRouter, File, UploadFile, HTTPException +from fastapi.responses import JSONResponse +import pandas as pd +import numpy as np +import io +import os +from datetime import datetime +from typing import Dict, Any + +# Import cleaning module +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +from data_cleaning import DataCleaner + +router = APIRouter() + + +def convert_to_serializable(obj): + """Convert numpy/pandas types to native Python types for JSON serialization""" + if isinstance(obj, (np.integer, np.int64, np.int32)): + return int(obj) + elif isinstance(obj, (np.floating, np.float64, np.float32)): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {key: convert_to_serializable(value) for key, value in obj.items()} + elif isinstance(obj, list): + return [convert_to_serializable(item) for item in obj] + return obj + + +@router.post("/clean") +async def clean_dataset(file: UploadFile = File(...)): + """ + Clean uploaded dataset - detect and anonymize PII + + - **file**: CSV file to clean + + Returns: + - Cleaned dataset statistics + - PII detections and anonymization actions + - Report file path for download + - Cleaned CSV file path for download + """ + + # Validate file type + if not file.filename.endswith('.csv'): + raise HTTPException(status_code=400, detail="Only CSV files are supported") + + try: + # Read uploaded file + contents = await file.read() + df = pd.read_csv(io.BytesIO(contents)) + + if df.empty: + raise 
HTTPException(status_code=400, detail="Uploaded file is empty") + + # Initialize Data Cleaner (with GPU if available) + print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)") + cleaner = DataCleaner(df, use_gpu=True) + + # Run cleaning (non-interactive mode for API) + cleaned_df, audit_report = cleaner.clean( + risky_features=None, # Auto-detect + interactive=False, # No user prompts in API mode + scan_all_cells=True + ) + + # Generate filenames + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + safe_filename = file.filename.replace('.csv', '') + + # Save cleaned CSV + cleaned_csv_filename = f"cleaned_{safe_filename}_{timestamp}.csv" + cleaned_csv_path = os.path.join("reports", cleaned_csv_filename) + full_cleaned_csv_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + cleaned_csv_path + ) + cleaner.save_cleaned_data(cleaned_df, full_cleaned_csv_path) + + # Save audit report + audit_report_filename = f"cleaning_audit_{safe_filename}_{timestamp}.json" + audit_report_path = os.path.join("reports", audit_report_filename) + full_audit_report_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(__file__))), + audit_report_path + ) + cleaner.save_audit_report(audit_report, full_audit_report_path) + + # Prepare response + response_data = { + "status": "success", + "filename": file.filename, + "dataset_info": { + "original_rows": int(audit_report["metadata"]["original_rows"]), + "original_columns": int(audit_report["metadata"]["original_columns"]), + "cleaned_rows": int(audit_report["metadata"]["cleaned_rows"]), + "cleaned_columns": int(audit_report["metadata"]["cleaned_columns"]) + }, + "gpu_acceleration": audit_report["metadata"].get("gpu_acceleration", { + "enabled": False, + "device": "CPU" + }), + "summary": { + "columns_removed": audit_report["summary"]["columns_removed"], + "columns_anonymized": audit_report["summary"]["columns_anonymized"], + "total_cells_affected": int(audit_report["summary"]["total_cells_affected"]) + }, + "pii_detections": { + col: { + "action": details["action"], + "entity_types": details["entity_types_found"], + "num_affected_rows": int(details.get("num_affected_rows", 0)), + "examples": details.get("examples", [])[:2] # Show 2 examples + } + for col, details in audit_report["details"].items() + }, + "gdpr_compliance": audit_report["compliance"]["gdpr_articles_applied"], + "files": { + "cleaned_csv": f"/{cleaned_csv_path}", + "audit_report": f"/{audit_report_path}" + }, + "timestamp": datetime.now().isoformat() + } + + # Convert all numpy/pandas types to native Python types + response_data = convert_to_serializable(response_data) + + return JSONResponse(content=response_data) + + except pd.errors.EmptyDataError: + raise HTTPException(status_code=400, detail="File is empty or invalid CSV format") + except Exception as e: + print(f"Error during cleaning: {str(e)}") + raise HTTPException(status_code=500, detail=f"Cleaning failed: {str(e)}") diff --git a/api/utils/__init__.py b/api/utils/__init__.py new file mode 100644 index 0000000..f88a663 --- /dev/null +++ b/api/utils/__init__.py @@ -0,0 +1 @@ +# Empty __init__.py files for Python package structure diff --git a/data_cleaning/__init__.py b/data_cleaning/__init__.py new file mode 100644 index 0000000..c311fd1 --- /dev/null +++ b/data_cleaning/__init__.py @@ -0,0 +1,12 @@ +""" +Data Cleaning Module +Automated PII detection and GDPR-compliant anonymization +""" + +from .cleaner import DataCleaner + +__version__ = '1.0.0' + +__all__ 
= [ + 'DataCleaner' +] diff --git a/cleaning.py b/data_cleaning/cleaner.py similarity index 100% rename from cleaning.py rename to data_cleaning/cleaner.py diff --git a/cleaning_config.py b/data_cleaning/config.py similarity index 100% rename from cleaning_config.py rename to data_cleaning/config.py diff --git a/discovery/main.py b/discovery/main.py deleted file mode 100644 index 96fa8e7..0000000 --- a/discovery/main.py +++ /dev/null @@ -1,167 +0,0 @@ -import csv -import re -from pathlib import Path -from collections import Counter -from datetime import datetime - -ROOT = Path("../Data/Politics") - -# Try to import spaCy, fall back to basic extraction if not available -try: - import spacy - nlp = spacy.load("en_core_web_sm") - USE_SPACY = True -except: - USE_SPACY = False - -# Regex patterns for deterministic detection -patterns = { - "EMAIL": re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"), - "PHONE": re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{3,4}"), - "UUID": re.compile(r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b"), - "IBAN": re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{1,30}\b"), - "DATE": re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}"), - "URL": re.compile(r"https?://[^\s]+"), - "SSN": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), -} - -def find_entities(text): - """Extract entities using regex patterns.""" - found = {} - for label, pattern in patterns.items(): - matches = pattern.findall(text) - if matches: - found[label] = list(set(matches))[:5] # Limit to 5 per type - return found - -def extract_with_spacy(text): - """Extract named entities using spaCy.""" - if not USE_SPACY: - return {}, {}, {} - - doc = nlp(text[:10000]) # Limit text length for performance - - persons = [] - orgs = [] - locations = [] - - for ent in doc.ents: - if ent.label_ == "PERSON": - persons.append(ent.text) - elif ent.label_ == "ORG": - orgs.append(ent.text) - elif ent.label_ in ["GPE", "LOC"]: - locations.append(ent.text) - - # Return most common entities - return ( - dict(Counter(persons).most_common(5)), - dict(Counter(orgs).most_common(5)), - dict(Counter(locations).most_common(5)) - ) - -def extract_metadata(text, filename): - """Extract basic metadata from text.""" - metadata = { - "char_count": len(text), - "word_count": len(text.split()), - "line_count": text.count('\n') + 1, - "file_extension": Path(filename).suffix, - } - return metadata - -def detect_content_type(text): - """Heuristic content type detection.""" - text_lower = text.lower() - - # Check for common document types - if any(word in text_lower[:1000] for word in ['dear', 'sincerely', 'regards']): - return "letter" - elif any(word in text_lower[:500] for word in ['article', 'section', 'amendment']): - return "legal" - elif any(word in text_lower[:500] for word in ['press release', 'for immediate release']): - return "press_release" - elif re.search(r'^\s*#', text[:100], re.MULTILINE): - return "markdown" - elif ' 0: - print("\nTo install spaCy for better entity extraction:") - print(" pip install spacy") - print(" python -m spacy download en_core_web_sm") \ No newline at end of file diff --git a/frontend/nordic-privacy-ai/.gitignore b/frontend/.gitignore similarity index 100% rename from frontend/nordic-privacy-ai/.gitignore rename to frontend/.gitignore diff --git a/frontend/nordic-privacy-ai/app/globals.css b/frontend/app/globals.css similarity index 100% 
rename from frontend/nordic-privacy-ai/app/globals.css
rename to frontend/app/globals.css
diff --git a/frontend/nordic-privacy-ai/app/layout.tsx b/frontend/app/layout.tsx
similarity index 100%
rename from frontend/nordic-privacy-ai/app/layout.tsx
rename to frontend/app/layout.tsx
diff --git a/frontend/nordic-privacy-ai/app/page.tsx b/frontend/app/page.tsx
similarity index 100%
rename from frontend/nordic-privacy-ai/app/page.tsx
rename to frontend/app/page.tsx
diff --git a/frontend/nordic-privacy-ai/app/try/page.tsx b/frontend/app/try/page.tsx
similarity index 85%
rename from frontend/nordic-privacy-ai/app/try/page.tsx
rename to frontend/app/try/page.tsx
index 66c51f5..bf2e24c 100644
--- a/frontend/nordic-privacy-ai/app/try/page.tsx
+++ b/frontend/app/try/page.tsx
@@ -14,10 +14,10 @@ export default function TryPage() {
-          Upload a CSV / JSON / text file. We will later parse, detect PII, and queue analyses.
+          Drag & drop a CSV / JSON / TXT here, or click to browse.
[The JSX markup of the remaining hunks was lost in extraction; the surviving fragments show the new UI adding a header/row preview table for the uploaded file, a raw-content preview ({fileMeta.contentPreview || "(no preview)"}), an error banner ({error && (...)}), and empty-state messages: "Upload and analyze a dataset to see bias metrics.", "Upload and analyze a dataset to see risk assessment.", "Recommendations will appear here after analysis.", "Process a dataset to see aggregated results."]