fix: Fixed the custom Tag selection

2026-02-07 22:18:59 +00:00 · 2025-11-08 04:40:50 +05:30
parent 6fd52b9fb1
commit 82527cab81
6 changed files with 1146 additions and 99 deletions
--- a/api/routers/clean.py
+++ b/api/routers/clean.py
@@ -3,14 +3,15 @@ Data Cleaning Router
 Handles PII detection and anonymization endpoints
 """

-from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi import APIRouter, File, UploadFile, HTTPException, Form
 from fastapi.responses import JSONResponse
 import pandas as pd
 import numpy as np
 import io
 import os
+import json
 from datetime import datetime
-from typing import Dict, Any
+from typing import Dict, Any, Optional

 # Import cleaning module
 import sys
@@ -36,11 +37,16 @@ def convert_to_serializable(obj):


@router.post("/clean")
-async def clean_dataset(file: UploadFile = File(...)):
+async def clean_dataset(
+    file: UploadFile = File(...),
+    custom_strategies: Optional[str] = Form(None)
+):
    """
    Clean uploaded dataset - detect and anonymize PII
    
    - **file**: CSV file to clean
+    - **custom_strategies**: Optional JSON string mapping column names to strategy choices
+      Format: {"column_name": {"enabled": true, "strategy": "HASHING"}}
    
    Returns:
        - Cleaned dataset statistics
@@ -53,6 +59,15 @@ async def clean_dataset(file: UploadFile = File(...)):
    if not file.filename.endswith('.csv'):
        raise HTTPException(status_code=400, detail="Only CSV files are supported")
    
+    # Parse custom strategies if provided
+    strategy_map = None
+    if custom_strategies:
+        try:
+            strategy_map = json.loads(custom_strategies)
+            print(f"Using custom strategies for {len(strategy_map)} columns")
+        except json.JSONDecodeError:
+            raise HTTPException(status_code=400, detail="Invalid custom_strategies JSON format")
+    
    try:
        # Read uploaded file
        contents = await file.read()
@@ -65,9 +80,24 @@ async def clean_dataset(file: UploadFile = File(...)):
        print(f"Cleaning dataset: {file.filename} ({len(df)} rows, {len(df.columns)} columns)")
        cleaner = DataCleaner(df, use_gpu=True)
        
+        # If custom strategies provided, filter and apply only enabled columns
+        risky_features_to_clean = None
+        if strategy_map:
+            # Only clean columns that are enabled in the strategy map
+            enabled_columns = [col for col, config in strategy_map.items() if config.get('enabled', True)]
+            risky_features_to_clean = enabled_columns
+            print(f"  Using custom strategies for {len(enabled_columns)} enabled columns")
+            
+            # Store the strategy choices for the cleaner to use
+            cleaner.custom_strategy_map = {
+                col: config['strategy'] 
+                for col, config in strategy_map.items() 
+                if config.get('enabled', True)
+            }
+        
        # Run cleaning (non-interactive mode for API)
        cleaned_df, audit_report = cleaner.clean(
-            risky_features=None,  # Auto-detect
+            risky_features=risky_features_to_clean,  # Use custom list if provided, otherwise auto-detect
            interactive=False,    # No user prompts in API mode
            scan_all_cells=True
        )