Mirror of https://github.com/PlatypusPus/MushroomEmpire.git, synced 2026-02-07 22:18:59 +00:00
Commit: Cleaned the Structure
api/routers/__init__.py (new file)
@@ -0,0 +1 @@
# Empty __init__.py file for Python package structure
api/routers/analyze.py (new file)
@@ -0,0 +1,141 @@
"""
AI Governance Analysis Router
Handles bias detection and risk analysis endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
import json
from datetime import datetime
from typing import Dict, Any

# Import AI Governance modules
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from ai_governance import AIGovernanceAnalyzer

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj
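
# Illustration (not part of the original file): the helper above exists because
# json.dumps rejects numpy scalar types, which pandas produces for counts and
# metrics. For example:
#
#   >>> import json, numpy as np
#   >>> json.dumps({"rows": np.int64(3)})
#   TypeError: Object of type int64 is not JSON serializable
#   >>> json.dumps(convert_to_serializable({"rows": np.int64(3)}))
#   '{"rows": 3}'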


@router.post("/analyze")
async def analyze_dataset(file: UploadFile = File(...)):
    """
    Analyze uploaded dataset for bias and risk

    - **file**: CSV file to analyze

    Returns:
    - Analysis results (bias metrics, risk assessment)
    - Report file path for download
    """

    # Validate file type
    if not file.filename or not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    try:
        # Read uploaded file
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Initialize AI Governance Analyzer
        analyzer = AIGovernanceAnalyzer()

        # Auto-detect target column: assume the last column (common convention)
        target_column = df.columns[-1]

        # Protected attributes: columns whose names match common sensitive keywords
        protected_keywords = ['gender', 'age', 'race', 'sex', 'ethnicity', 'religion', 'nationality']
        protected_attributes = [col for col in df.columns
                                if any(keyword in col.lower() for keyword in protected_keywords)]

        # If no protected attributes found, fall back to the first few categorical columns
        if not protected_attributes:
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            protected_attributes = [col for col in categorical_cols if col != target_column][:3]

        print(f"Analyzing dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        print(f"Target column: {target_column}")
        print(f"Protected attributes: {protected_attributes}")

        # Run analysis
        report = analyzer.analyze_dataframe(df, target_column, protected_attributes)

        # Generate report filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = (file.filename or "dataset").replace('.csv', '')
        report_filename = f"governance_report_{safe_filename}_{timestamp}.json"
        report_path = os.path.join("reports", report_filename)

        # Save full report to disk (under the project root, three levels up from this file)
        full_report_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            report_path
        )
        analyzer.save_report(report, full_report_path)

        # Prepare response with summary
        response_data = {
            "status": "success",
            "filename": file.filename,
            "dataset_info": {
                "rows": len(df),
                "columns": len(df.columns),
                "features": list(df.columns)
            },
            "model_performance": {
                "accuracy": report.get("model_metrics", {}).get("accuracy", 0),
                "precision": report.get("model_metrics", {}).get("precision", 0),
                "recall": report.get("model_metrics", {}).get("recall", 0),
                "f1_score": report.get("model_metrics", {}).get("f1_score", 0)
            },
            "bias_metrics": {
                "overall_bias_score": report.get("bias_metrics", {}).get("overall_bias_score", 0),
                "disparate_impact": report.get("bias_metrics", {}).get("disparate_impact", {}),
                "statistical_parity": report.get("bias_metrics", {}).get("statistical_parity_difference", {}),
                "violations_detected": report.get("bias_metrics", {}).get("fairness_violations", [])
            },
            "risk_assessment": {
                "overall_risk_score": report.get("risk_metrics", {}).get("overall_risk_score", 0),
                "privacy_risks": report.get("risk_metrics", {}).get("privacy_risks", []),
                "ethical_risks": report.get("risk_metrics", {}).get("ethical_risks", []),
                "compliance_risks": report.get("risk_metrics", {}).get("compliance_risks", []),
                "data_quality_risks": report.get("risk_metrics", {}).get("data_quality_risks", [])
            },
            "recommendations": report.get("recommendations", []),
            "report_file": f"/{report_path}",
            "timestamp": datetime.now().isoformat()
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except HTTPException:
        # Let the deliberate 4xx errors above (e.g. empty upload) propagate
        # instead of being re-wrapped as a 500 by the generic handler below
        raise
    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except Exception as e:
        print(f"Error during analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
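
For reference, a minimal client sketch for exercising this endpoint. It assumes the router is mounted without a prefix on a server at http://localhost:8000 and uses the third-party requests library; none of that is part of this commit, and employees.csv is a placeholder filename.

import requests

# Upload a CSV; the multipart field must be named "file" to match the
# UploadFile parameter of analyze_dataset above.
with open("employees.csv", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/analyze",  # assumed host/port and mount point
        files={"file": ("employees.csv", f, "text/csv")},
    )
resp.raise_for_status()

result = resp.json()
print(result["bias_metrics"]["overall_bias_score"])
print(result["report_file"])  # server-side path of the saved JSON report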
api/routers/clean.py (new file)
@@ -0,0 +1,142 @@
"""
Data Cleaning Router
Handles PII detection and anonymization endpoints
"""

from fastapi import APIRouter, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import pandas as pd
import numpy as np
import io
import os
from datetime import datetime
from typing import Dict, Any

# Import cleaning module
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
from data_cleaning import DataCleaner

router = APIRouter()


def convert_to_serializable(obj):
    """Convert numpy/pandas types to native Python types for JSON serialization"""
    if isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float64, np.float32)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    return obj


@router.post("/clean")
async def clean_dataset(file: UploadFile = File(...)):
    """
    Clean uploaded dataset - detect and anonymize PII

    - **file**: CSV file to clean

    Returns:
    - Cleaned dataset statistics
    - PII detections and anonymization actions
    - Report file path for download
    - Cleaned CSV file path for download
    """

    # Validate file type
    if not file.filename or not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")

    try:
        # Read uploaded file
        contents = await file.read()
        df = pd.read_csv(io.BytesIO(contents))

        if df.empty:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # Initialize Data Cleaner (with GPU if available)
        print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        cleaner = DataCleaner(df, use_gpu=True)

        # Run cleaning (non-interactive mode for API)
        cleaned_df, audit_report = cleaner.clean(
            risky_features=None,  # Auto-detect
            interactive=False,    # No user prompts in API mode
            scan_all_cells=True
        )

        # Generate filenames
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = file.filename.replace('.csv', '')

        # Save cleaned CSV (under the project root, three levels up from this file)
        cleaned_csv_filename = f"cleaned_{safe_filename}_{timestamp}.csv"
        cleaned_csv_path = os.path.join("reports", cleaned_csv_filename)
        full_cleaned_csv_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            cleaned_csv_path
        )
        cleaner.save_cleaned_data(cleaned_df, full_cleaned_csv_path)

        # Save audit report
        audit_report_filename = f"cleaning_audit_{safe_filename}_{timestamp}.json"
        audit_report_path = os.path.join("reports", audit_report_filename)
        full_audit_report_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            audit_report_path
        )
        cleaner.save_audit_report(audit_report, full_audit_report_path)

        # Prepare response
        response_data = {
            "status": "success",
            "filename": file.filename,
            "dataset_info": {
                "original_rows": int(audit_report["metadata"]["original_rows"]),
                "original_columns": int(audit_report["metadata"]["original_columns"]),
                "cleaned_rows": int(audit_report["metadata"]["cleaned_rows"]),
                "cleaned_columns": int(audit_report["metadata"]["cleaned_columns"])
            },
            "gpu_acceleration": audit_report["metadata"].get("gpu_acceleration", {
                "enabled": False,
                "device": "CPU"
            }),
            "summary": {
                "columns_removed": audit_report["summary"]["columns_removed"],
                "columns_anonymized": audit_report["summary"]["columns_anonymized"],
                "total_cells_affected": int(audit_report["summary"]["total_cells_affected"])
            },
            "pii_detections": {
                col: {
                    "action": details["action"],
                    "entity_types": details["entity_types_found"],
                    "num_affected_rows": int(details.get("num_affected_rows", 0)),
                    "examples": details.get("examples", [])[:2]  # Show at most 2 examples
                }
                for col, details in audit_report["details"].items()
            },
            "gdpr_compliance": audit_report["compliance"]["gdpr_articles_applied"],
            "files": {
                "cleaned_csv": f"/{cleaned_csv_path}",
                "audit_report": f"/{audit_report_path}"
            },
            "timestamp": datetime.now().isoformat()
        }

        # Convert all numpy/pandas types to native Python types
        response_data = convert_to_serializable(response_data)

        return JSONResponse(content=response_data)

    except HTTPException:
        # Let the deliberate 4xx errors above propagate instead of being
        # re-wrapped as a 500 by the generic handler below
        raise
    except pd.errors.EmptyDataError:
        raise HTTPException(status_code=400, detail="File is empty or invalid CSV format")
    except Exception as e:
        print(f"Error during cleaning: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Cleaning failed: {str(e)}")
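
Likewise, a hedged sketch of driving /clean and then retrieving the cleaned CSV. It assumes the same local setup as the sketch above, plus that the reports directory is exposed as static files at /reports; this commit only writes the files to disk, so the download step is an assumption, and customers.csv is a placeholder filename.

import requests

BASE = "http://localhost:8000"  # assumed host/port, not defined in this commit

with open("customers.csv", "rb") as f:
    resp = requests.post(f"{BASE}/clean",
                         files={"file": ("customers.csv", f, "text/csv")})
resp.raise_for_status()
result = resp.json()

print(result["summary"])         # columns removed/anonymized, cells affected
print(result["pii_detections"])  # per-column PII actions taken

# Fetch the cleaned CSV, assuming /reports is served statically.
cleaned = requests.get(BASE + result["files"]["cleaned_csv"])
cleaned.raise_for_status()
with open("customers_cleaned.csv", "wb") as out:
    out.write(cleaned.content)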