Cleaned the Structure

2025-11-07 14:20:22 +05:30
parent d7dbcff4b6
commit 1b622b6a19
42 changed files with 2164 additions and 806 deletions

api/routers/__init__.py Normal file

@@ -0,0 +1 @@
# Empty __init__.py file for Python package structure

api/routers/analyze.py Normal file

@@ -0,0 +1,141 @@
"""
AI Governance Analysis Router
Handles bias detection and risk analysis endpoints
"""
from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import json
from datetime import datetime
from typing import Dict, Any
# Import AI Governance modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from ai_governance import AIGovernanceAnalyzer
router = APIRouter()
def convert_to_serializable(obj):
"""Convert numpy/pandas types to native Python types for JSON serialization"""
if isinstance(obj, (np.integer, np.int64, np.int32)):
return int(obj)
elif isinstance(obj, (np.floating, np.float64, np.float32)):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, dict):
return {key: convert_to_serializable(value) for key, value in obj.items()}
elif isinstance(obj, list):
return [convert_to_serializable(item) for item in obj]
return obj
@router.post("/analyze")
async def analyze_dataset(file: UploadFile = File(...)):
"""
Analyze uploaded dataset for bias and risk
- **file**: CSV file to analyze
Returns:
- Analysis results (bias metrics, risk assessment)
- Report file path for download
"""
# Validate file type
if not file.filename.endswith('.csv'):
raise HTTPException(status_code=400, detail="Only CSV files are supported")
try:
# Read uploaded file
contents = await file.read()
df = pd.read_csv(io.BytesIO(contents))
if df.empty:
raise HTTPException(status_code=400, detail="Uploaded file is empty")
# Initialize AI Governance Analyzer
analyzer = AIGovernanceAnalyzer()
# Auto-detect target column and protected attributes
# Target: Last column (common convention) or first binary/categorical column
target_column = df.columns[-1]
# Protected attributes: Common sensitive columns
protected_keywords = ['gender', 'age', 'race', 'sex', 'ethnicity', 'religion', 'nationality']
protected_attributes = [col for col in df.columns
if any(keyword in col.lower() for keyword in protected_keywords)]
# If no protected attributes found, use first few categorical columns
if not protected_attributes:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
protected_attributes = [col for col in categorical_cols if col != target_column][:3]
print(f"Analyzing dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
print(f"Target column: {target_column}")
print(f"Protected attributes: {protected_attributes}")
# Run analysis
report = analyzer.analyze_dataframe(df, target_column, protected_attributes)
# Generate report filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
safe_filename = (file.filename or "dataset").replace('.csv', '')
report_filename = f"governance_report_{safe_filename}_{timestamp}.json"
report_path = os.path.join("reports", report_filename)
# Save full report to disk
full_report_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
report_path
)
analyzer.save_report(report, full_report_path)
# Prepare response with summary
response_data = {
"status": "success",
"filename": file.filename,
"dataset_info": {
"rows": len(df),
"columns": len(df.columns),
"features": list(df.columns)
},
"model_performance": {
"accuracy": report.get("model_metrics", {}).get("accuracy", 0),
"precision": report.get("model_metrics", {}).get("precision", 0),
"recall": report.get("model_metrics", {}).get("recall", 0),
"f1_score": report.get("model_metrics", {}).get("f1_score", 0)
},
"bias_metrics": {
"overall_bias_score": report.get("bias_metrics", {}).get("overall_bias_score", 0),
"disparate_impact": report.get("bias_metrics", {}).get("disparate_impact", {}),
"statistical_parity": report.get("bias_metrics", {}).get("statistical_parity_difference", {}),
"violations_detected": report.get("bias_metrics", {}).get("fairness_violations", [])
},
"risk_assessment": {
"overall_risk_score": report.get("risk_metrics", {}).get("overall_risk_score", 0),
"privacy_risks": report.get("risk_metrics", {}).get("privacy_risks", []),
"ethical_risks": report.get("risk_metrics", {}).get("ethical_risks", []),
"compliance_risks": report.get("risk_metrics", {}).get("compliance_risks", []),
"data_quality_risks": report.get("risk_metrics", {}).get("data_quality_risks", [])
},
"recommendations": report.get("recommendations", []),
"report_file": f"/{report_path}",
"timestamp": datetime.now().isoformat()
}
# Convert all numpy/pandas types to native Python types
response_data = convert_to_serializable(response_data)
return JSONResponse(content=response_data)
except pd.errors.EmptyDataError:
raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
except Exception as e:
print(f"Error during analysis: {str(e)}")
raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
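
A minimal client-side sketch of exercising this endpoint with the requests library; the host, port, and the absence of a route prefix are assumptions, since the main application wiring is not part of this commit:

# Hypothetical usage: POST a local CSV to the /analyze endpoint
import requests

with open("dataset.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/analyze",  # assumed host/port, no router prefix
        files={"file": ("dataset.csv", f, "text/csv")},
    )
resp.raise_for_status()
result = resp.json()
print(result["bias_metrics"]["overall_bias_score"])
print(result["report_file"])  # server-side path of the saved JSON report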

api/routers/clean.py Normal file

@@ -0,0 +1,142 @@
"""
Data Cleaning Router
Handles PII detection and anonymization endpoints
"""
from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
from datetime import datetime
from typing import Dict, Any
# Import cleaning module
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from data_cleaning import DataCleaner
router = APIRouter()
def convert_to_serializable(obj):
"""Convert numpy/pandas types to native Python types for JSON serialization"""
if isinstance(obj, (np.integer, np.int64, np.int32)):
return int(obj)
elif isinstance(obj, (np.floating, np.float64, np.float32)):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, dict):
return {key: convert_to_serializable(value) for key, value in obj.items()}
elif isinstance(obj, list):
return [convert_to_serializable(item) for item in obj]
return obj
@router.post("/clean")
async def clean_dataset(file: UploadFile = File(...)):
"""
Clean uploaded dataset - detect and anonymize PII
- **file**: CSV file to clean
Returns:
- Cleaned dataset statistics
- PII detections and anonymization actions
- Report file path for download
- Cleaned CSV file path for download
"""
# Validate file type
if not file.filename.endswith('.csv'):
raise HTTPException(status_code=400, detail="Only CSV files are supported")
try:
# Read uploaded file
contents = await file.read()
df = pd.read_csv(io.BytesIO(contents))
if df.empty:
raise HTTPException(status_code=400, detail="Uploaded file is empty")
# Initialize Data Cleaner (with GPU if available)
print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
cleaner = DataCleaner(df, use_gpu=True)
# Run cleaning (non-interactive mode for API)
cleaned_df, audit_report = cleaner.clean(
risky_features=None, # Auto-detect
interactive=False, # No user prompts in API mode
scan_all_cells=True
)
# Generate filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
safe_filename = file.filename.replace('.csv', '')
# Save cleaned CSV
cleaned_csv_filename = f"cleaned_{safe_filename}_{timestamp}.csv"
cleaned_csv_path = os.path.join("reports", cleaned_csv_filename)
full_cleaned_csv_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
cleaned_csv_path
)
cleaner.save_cleaned_data(cleaned_df, full_cleaned_csv_path)
# Save audit report
audit_report_filename = f"cleaning_audit_{safe_filename}_{timestamp}.json"
audit_report_path = os.path.join("reports", audit_report_filename)
full_audit_report_path = os.path.join(
os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
audit_report_path
)
cleaner.save_audit_report(audit_report, full_audit_report_path)
# Prepare response
response_data = {
"status": "success",
"filename": file.filename,
"dataset_info": {
"original_rows": int(audit_report["metadata"]["original_rows"]),
"original_columns": int(audit_report["metadata"]["original_columns"]),
"cleaned_rows": int(audit_report["metadata"]["cleaned_rows"]),
"cleaned_columns": int(audit_report["metadata"]["cleaned_columns"])
},
"gpu_acceleration": audit_report["metadata"].get("gpu_acceleration", {
"enabled": False,
"device": "CPU"
}),
"summary": {
"columns_removed": audit_report["summary"]["columns_removed"],
"columns_anonymized": audit_report["summary"]["columns_anonymized"],
"total_cells_affected": int(audit_report["summary"]["total_cells_affected"])
},
"pii_detections": {
col: {
"action": details["action"],
"entity_types": details["entity_types_found"],
"num_affected_rows": int(details.get("num_affected_rows", 0)),
"examples": details.get("examples", [])[:2] # Show 2 examples
}
for col, details in audit_report["details"].items()
},
"gdpr_compliance": audit_report["compliance"]["gdpr_articles_applied"],
"files": {
"cleaned_csv": f"/{cleaned_csv_path}",
"audit_report": f"/{audit_report_path}"
},
"timestamp": datetime.now().isoformat()
}
# Convert all numpy/pandas types to native Python types
response_data = convert_to_serializable(response_data)
return JSONResponse(content=response_data)
except pd.errors.EmptyDataError:
raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
except Exception as e:
print(f"Error during cleaning: {str(e)}")
raise HTTPException(status_code=500, detail=f"Cleaning failed: {str(e)}")
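
Neither router does anything until it is mounted in a FastAPI application. That file is not part of this diff, so the sketch below is only an assumption about how the wiring could look; the module path, app title, and prefix-free mounting are all hypothetical:

# Hypothetical main app: mount both routers on one FastAPI instance
from fastapi import FastAPI

from api.routers import analyze, clean  # assumed import path

app = FastAPI(title="AI Governance API")
app.include_router(analyze.router)  # exposes POST /analyze
app.include_router(clean.router)    # exposes POST /clean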