Mirror of https://github.com/PlatypusPus/MushroomEmpire.git, synced 2026-02-07 22:18:59 +00:00
Commit: Cleaned the Structure
api/routers/__init__.py (new file)
@@ -0,0 +1 @@
# Empty __init__.py file for Python package structure
api/routers/analyze.py (new file)
@@ -0,0 +1,141 @@
"""
AI Governance Analysis Router
Handles bias detection and risk analysis endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import json
from datetime import datetime
from typing import Dict, Any

# Import AI Governance modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from ai_governance import AIGovernanceAnalyzer

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj
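
# Illustration (not part of the original file): the helper above exists because
# json.dumps rejects numpy scalar types, which pandas produces for counts and
# metrics. For example:
#
#   >>> import json, numpy as np
#   >>> json.dumps({"rows": np.int64(3)})
#   TypeError: Object of type int64 is not JSON serializable
#   >>> json.dumps(convert_to_serializable({"rows": np.int64(3)}))
#   '{"rows": 3}'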


@router.post("/analyze")
async def analyze_dataset(file: UploadFile = File(...)):
    """
    Analyze uploaded dataset for bias and risk

    - **file**: CSV file to analyze

    Returns:
    - Analysis results (bias metrics, risk assessment)
    - Report file path for download
    """

    # Validate file type
    if not file.filename or not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    try:
        # Read uploaded file
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Initialize AI Governance Analyzer
        analyzer = AIGovernanceAnalyzer()

        # Auto-detect target column: assume the last column (common convention)
        target_column = df.columns[-1]

        # Protected attributes: columns whose names match common sensitive keywords
        protected_keywords = ['gender', 'age', 'race', 'sex', 'ethnicity', 'religion', 'nationality']
        protected_attributes = [col for col in df.columns
                                if any(keyword in col.lower() for keyword in protected_keywords)]

        # If no protected attributes found, fall back to the first few categorical columns
        if not protected_attributes:
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            protected_attributes = [col for col in categorical_cols if col != target_column][:3]

        print(f"Analyzing dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        print(f"Target column: {target_column}")
        print(f"Protected attributes: {protected_attributes}")

        # Run analysis
        report = analyzer.analyze_dataframe(df, target_column, protected_attributes)

        # Generate report filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = (file.filename or "dataset").replace('.csv', '')
        report_filename = f"governance_report_{safe_filename}_{timestamp}.json"
        report_path = os.path.join("reports", report_filename)

        # Save full report to disk (under the project root, three levels up from this file)
        full_report_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            report_path
        )
        analyzer.save_report(report, full_report_path)

        # Prepare response with summary
        response_data = {
            "status": "success",
            "filename": file.filename,
            "dataset_info": {
                "rows": len(df),
                "columns": len(df.columns),
                "features": list(df.columns)
            },
            "model_performance": {
                "accuracy": report.get("model_metrics", {}).get("accuracy", 0),
                "precision": report.get("model_metrics", {}).get("precision", 0),
                "recall": report.get("model_metrics", {}).get("recall", 0),
                "f1_score": report.get("model_metrics", {}).get("f1_score", 0)
            },
            "bias_metrics": {
                "overall_bias_score": report.get("bias_metrics", {}).get("overall_bias_score", 0),
                "disparate_impact": report.get("bias_metrics", {}).get("disparate_impact", {}),
                "statistical_parity": report.get("bias_metrics", {}).get("statistical_parity_difference", {}),
                "violations_detected": report.get("bias_metrics", {}).get("fairness_violations", [])
            },
            "risk_assessment": {
                "overall_risk_score": report.get("risk_metrics", {}).get("overall_risk_score", 0),
                "privacy_risks": report.get("risk_metrics", {}).get("privacy_risks", []),
                "ethical_risks": report.get("risk_metrics", {}).get("ethical_risks", []),
                "compliance_risks": report.get("risk_metrics", {}).get("compliance_risks", []),
                "data_quality_risks": report.get("risk_metrics", {}).get("data_quality_risks", [])
            },
            "recommendations": report.get("recommendations", []),
            "report_file": f"/{report_path}",
            "timestamp": datetime.now().isoformat()
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except HTTPException:
        # Let the deliberate 4xx errors above (e.g. empty upload) propagate
        # instead of being re-wrapped as a 500 by the generic handler below
        raise
    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
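
For reference, a minimal client sketch for exercising this endpoint. It assumes the router is mounted without a prefix on a server at http://localhost:8000 and uses the third-party requests library; none of that is part of this commit, and employees.csv is a placeholder filename.

import requests

# Upload a CSV; the multipart field must be named "file" to match the
# UploadFile parameter of analyze_dataset above.
with open("employees.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/analyze",  # assumed host/port and mount point
        files={"file": ("employees.csv", f, "text/csv")},
    )
resp.raise_for_status()

result = resp.json()
print(result["bias_metrics"]["overall_bias_score"])
print(result["report_file"])  # server-side path of the saved JSON report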
api/routers/clean.py (new file)
@@ -0,0 +1,142 @@
"""
Data Cleaning Router
Handles PII detection and anonymization endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
from datetime import datetime
from typing import Dict, Any

# Import cleaning module
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from data_cleaning import DataCleaner

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj


@router.post("/clean")
async def clean_dataset(file: UploadFile = File(...)):
    """
    Clean uploaded dataset - detect and anonymize PII

    - **file**: CSV file to clean

    Returns:
    - Cleaned dataset statistics
    - PII detections and anonymization actions
    - Report file path for download
    - Cleaned CSV file path for download
    """

    # Validate file type
    if not file.filename or not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    try:
        # Read uploaded file
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Initialize Data Cleaner (with GPU if available)
        print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        cleaner = DataCleaner(df, use_gpu=True)

        # Run cleaning (non-interactive mode for API)
        cleaned_df, audit_report = cleaner.clean(
            risky_features=None,  # Auto-detect
            interactive=False,    # No user prompts in API mode
            scan_all_cells=True
        )

        # Generate filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = file.filename.replace('.csv', '')

        # Save cleaned CSV (under the project root, three levels up from this file)
        cleaned_csv_filename = f"cleaned_{safe_filename}_{timestamp}.csv"
        cleaned_csv_path = os.path.join("reports", cleaned_csv_filename)
        full_cleaned_csv_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            cleaned_csv_path
        )
        cleaner.save_cleaned_data(cleaned_df, full_cleaned_csv_path)

        # Save audit report
        audit_report_filename = f"cleaning_audit_{safe_filename}_{timestamp}.json"
        audit_report_path = os.path.join("reports", audit_report_filename)
        full_audit_report_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            audit_report_path
        )
        cleaner.save_audit_report(audit_report, full_audit_report_path)

        # Prepare response
        response_data = {
            "status": "success",
            "filename": file.filename,
            "dataset_info": {
                "original_rows": int(audit_report["metadata"]["original_rows"]),
                "original_columns": int(audit_report["metadata"]["original_columns"]),
                "cleaned_rows": int(audit_report["metadata"]["cleaned_rows"]),
                "cleaned_columns": int(audit_report["metadata"]["cleaned_columns"])
            },
            "gpu_acceleration": audit_report["metadata"].get("gpu_acceleration", {
                "enabled": False,
                "device": "CPU"
            }),
            "summary": {
                "columns_removed": audit_report["summary"]["columns_removed"],
                "columns_anonymized": audit_report["summary"]["columns_anonymized"],
                "total_cells_affected": int(audit_report["summary"]["total_cells_affected"])
            },
            "pii_detections": {
                col: {
                    "action": details["action"],
                    "entity_types": details["entity_types_found"],
                    "num_affected_rows": int(details.get("num_affected_rows", 0)),
                    "examples": details.get("examples", [])[:2]  # Show at most 2 examples
                }
                for col, details in audit_report["details"].items()
            },
            "gdpr_compliance": audit_report["compliance"]["gdpr_articles_applied"],
            "files": {
                "cleaned_csv": f"/{cleaned_csv_path}",
                "audit_report": f"/{audit_report_path}"
            },
            "timestamp": datetime.now().isoformat()
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except HTTPException:
        # Let the deliberate 4xx errors above propagate instead of being
        # re-wrapped as a 500 by the generic handler below
        raise
    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except Exception as e:
        print(f"Error during cleaning: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Cleaning failed: {str(e)}")
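
Likewise, a hedged sketch of driving /clean and then retrieving the cleaned CSV. It assumes the same local setup as the sketch above, plus that the reports directory is exposed as static files at /reports; this commit only writes the files to disk, so the download step is an assumption, and customers.csv is a placeholder filename.

import requests

BASE = "http://localhost:8000"  # assumed host/port, not defined in this commit

with open("customers.csv", "rb") as f:
    resp = requests.post(f"{BASE}/clean",
                         files={"file": ("customers.csv", f, "text/csv")})
resp.raise_for_status()
result = resp.json()

print(result["summary"])         # columns removed/anonymized, cells affected
print(result["pii_detections"])  # per-column PII actions taken

# Fetch the cleaned CSV, assuming /reports is served statically.
cleaned = requests.get(BASE + result["files"]["cleaned_csv"])
cleaned.raise_for_status()
with open("customers_cleaned.csv", "wb") as out:
    out.write(cleaned.content)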