Cleaning.py added

2026-02-07 22:18:59 +00:00 · 2025-11-07 05:19:29 +05:30
parent ccddffb652
commit f9963b0ab3
3 changed files with 1235 additions and 0 deletions
--- a/test_cleaning.py
+++ b/test_cleaning.py
@@ -0,0 +1,268 @@
+"""
+Test script for data cleaning module
+Demonstrates PII detection and anonymization on loan dataset
+"""
+
+import pandas as pd
+import sys
+import os
+
+# Add this script's own directory to the import path so `cleaning` can be imported
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from cleaning import DataCleaner, CleaningConfig
+
+
+def test_basic_cleaning():
+    """Run non-interactive PII detection over the whole loan dataset.
+
+    Reads ``Datasets/loan_data.csv`` (path is relative to the current
+    working directory), hands the frame to ``DataCleaner.clean`` with no
+    pre-flagged features and ``scan_all_cells=True``, then prints the
+    audit summary.
+
+    Returns:
+        The ``(cleaned_df, audit_report)`` pair returned by
+        ``DataCleaner.clean``.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("TEST 1: Basic PII Detection on Loan Dataset")
+    print(rule)
+
+    loan_df = pd.read_csv('Datasets/loan_data.csv')
+    column_names = list(loan_df.columns)
+    print(f"\n✓ Loaded dataset: {len(loan_df)} rows × {len(column_names)} columns")
+    print(f"  Columns: {column_names}")
+
+    cleaner = DataCleaner(loan_df)
+
+    print("\n🔍 Running PII detection...")
+    # No feature list is supplied and prompts are disabled so the run
+    # needs no user input.
+    clean_options = {
+        'risky_features': None,
+        'interactive': False,
+        'scan_all_cells': True,
+    }
+    cleaned_df, audit_report = cleaner.clean(**clean_options)
+
+    cleaner.print_audit_summary(audit_report)
+
+    return cleaned_df, audit_report
+
+
+def test_with_risky_features():
+    """Clean the loan dataset using a hard-coded list of flagged columns.
+
+    Reads ``Datasets/loan_data.csv`` (relative to the current working
+    directory) and calls ``DataCleaner.clean`` non-interactively with
+    ``scan_all_cells=False`` and a fixed list of column names standing in
+    for the output of a risk analysis step. Prints the audit summary.
+
+    Returns:
+        The ``(cleaned_df, audit_report)`` pair returned by
+        ``DataCleaner.clean``.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("TEST 2: Cleaning with Pre-Flagged Risky Features")
+    print(rule)
+
+    loan_df = pd.read_csv('Datasets/loan_data.csv')
+
+    # Stand-in for the feature list a RiskAnalyzer run would provide.
+    risky_features = [
+        'person_education',
+        'loan_intent',
+        'person_home_ownership',
+    ]
+    print(f"\n⚠️  Risky features flagged by RiskAnalyzer: {risky_features}")
+
+    cleaner = DataCleaner(loan_df)
+
+    # Restrict the run to the flagged columns; no prompts.
+    clean_options = {
+        'risky_features': risky_features,
+        'interactive': False,
+        'scan_all_cells': False,
+    }
+    cleaned_df, audit_report = cleaner.clean(**clean_options)
+
+    cleaner.print_audit_summary(audit_report)
+
+    return cleaned_df, audit_report
+
+
+def test_with_synthetic_pii():
+    """Exercise the cleaner on a small hand-built frame full of obvious PII.
+
+    Builds a five-row DataFrame with e-mail, phone, SSN-shaped and
+    free-text columns, runs ``DataCleaner.clean`` non-interactively with
+    ``scan_all_cells=True``, prints the before/after frames and the audit
+    summary, and writes both results under ``output/``.
+
+    Returns:
+        The ``(cleaned_df, audit_report)`` pair returned by
+        ``DataCleaner.clean``.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("TEST 3: Synthetic PII Detection")
+    print(rule)
+
+    # Column values are kept as separate lists so each kind of PII is easy
+    # to find and extend.
+    emails = [
+        'john.doe@example.com',
+        'alice.smith@company.org',
+        'bob.jones@email.com',
+        'carol.white@test.net',
+        'dave.brown@sample.com',
+    ]
+    # Several different phone formats on purpose.
+    phones = [
+        '+1-555-123-4567',
+        '555-234-5678',
+        '(555) 345-6789',
+        '555.456.7890',
+        '5555678901',
+    ]
+    ssns = [
+        '123-45-6789',
+        '234-56-7890',
+        '345-67-8901',
+        '456-78-9012',
+        '567-89-0123',
+    ]
+    # Free text with PII embedded inside longer sentences.
+    notes = [
+        'Customer called from 192.168.1.1',
+        'Contact via email: test@example.com',
+        'SSN verified: 111-22-3333',
+        'Previous address: 123 Main St, Boston',
+        'Phone backup: 555-999-8888',
+    ]
+    test_data = pd.DataFrame({
+        'customer_id': [1, 2, 3, 4, 5],
+        'email': emails,
+        'phone': phones,
+        'ssn': ssns,
+        'notes': notes,
+        'amount': [1000, 2000, 1500, 3000, 2500],
+    })
+
+    print("\n✓ Created synthetic dataset with PII:")
+    print(test_data.head())
+
+    cleaner = DataCleaner(test_data)
+
+    clean_options = {
+        'risky_features': None,
+        'interactive': False,
+        'scan_all_cells': True,
+    }
+    cleaned_df, audit_report = cleaner.clean(**clean_options)
+
+    print("\n🔒 Cleaned dataset:")
+    print(cleaned_df.head())
+
+    cleaner.print_audit_summary(audit_report)
+
+    # Persist both artefacts; the directory is created on first use.
+    os.makedirs('output', exist_ok=True)
+    cleaner.save_cleaned_data(cleaned_df, 'output/synthetic_cleaned.csv')
+    cleaner.save_audit_report(audit_report, 'output/synthetic_audit.json')
+
+    return cleaned_df, audit_report
+
+
+def test_interactive_mode():
+    """Run the cleaner with interactive prompts on deliberately ambiguous text.
+
+    Builds a three-row DataFrame whose ``description`` column contains
+    words that could be read as either a person or a place, asks the user
+    for confirmation, and — only on ``y`` — runs ``DataCleaner.clean`` with
+    ``interactive=True`` and prints the cleaned frame and audit summary.
+
+    If standard input cannot be read (closed or piped stdin raises
+    ``EOFError``; pytest's output capture raises ``OSError``), the test is
+    skipped instead of crashing, so the function is safe to collect in
+    automated runs.
+
+    Returns:
+        None.
+    """
+    print("\n" + "="*70)
+    print("TEST 4: Interactive Mode (Manual Decisions)")
+    print("="*70)
+
+    # Create ambiguous test data
+    test_data = pd.DataFrame({
+        'id': [1, 2, 3],
+        'description': [
+            'Customer from Paris contacted us',  # Paris = location or name?
+            'Spoke with Jordan about the account',  # Jordan = location or name?
+            'Meeting scheduled for March 15th'  # Date
+        ],
+        'value': [100, 200, 300]
+    })
+
+    print("\n✓ Created dataset with ambiguous PII:")
+    print(test_data)
+
+    print("\n⚠️  This test requires user input for ambiguous cases.")
+    print("    You'll be prompted to choose anonymization strategies.")
+
+    try:
+        proceed = input("\nProceed with interactive test? (y/n): ").strip().lower()
+    except (EOFError, OSError):
+        # No usable stdin (CI, piped input, pytest capture): treat as "no".
+        print()  # finish the prompt line that input() left open
+        proceed = ''
+
+    if proceed != 'y':
+        print("  Skipped interactive test.")
+        return
+
+    cleaner = DataCleaner(test_data)
+    cleaned_df, audit_report = cleaner.clean(
+        risky_features=None,
+        interactive=True,  # Enable interactive prompts
+        scan_all_cells=True
+    )
+
+    print("\n🔒 Cleaned dataset:")
+    print(cleaned_df)
+
+    cleaner.print_audit_summary(audit_report)
+
+
+def demonstrate_integration_with_analysis():
+    """Walk through the cleaning step of the analysis workflow end to end.
+
+    Reads ``Datasets/loan_data.csv`` (relative to the current working
+    directory), cleans a hard-coded pair of columns non-interactively,
+    writes the original frame, the cleaned frame and the audit report
+    under ``output/``, and prints a narrated description of each step.
+
+    Returns:
+        None.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("INTEGRATION DEMO: Cleaning → Analysis Workflow")
+    print(rule)
+
+    loan_df = pd.read_csv('Datasets/loan_data.csv')
+
+    print("\n📊 Workflow:")
+    workflow_steps = (
+        "  1. Original dataset → Risk Analysis",
+        "  2. Risk Analysis → Identifies risky features",
+        "  3. Risky features → Data Cleaning (this step)",
+        "  4. Cleaned dataset → Re-run Analysis (optional)",
+    )
+    for step in workflow_steps:
+        print(step)
+
+    # Hard-coded stand-in for what the risk analysis step would report.
+    simulated_risky_features = ['person_education', 'loan_intent']
+    print(f"\n⚠️  Step 2 Output (simulated): Risky features = {simulated_risky_features}")
+
+    print("\n🔒 Step 3: Cleaning risky features...")
+    cleaner = DataCleaner(loan_df)
+    clean_options = {
+        'risky_features': simulated_risky_features,
+        'interactive': False,
+        'scan_all_cells': False,
+    }
+    cleaned_df, audit_report = cleaner.clean(**clean_options)
+
+    # Write the original, the cleaned data and the audit trail side by side.
+    # NOTE(review): the "original" is written after clean() runs; this
+    # presumes DataCleaner does not modify the frame it was given — confirm.
+    os.makedirs('output', exist_ok=True)
+    loan_df.to_csv('output/loan_data_original.csv', index=False)
+    cleaner.save_cleaned_data(cleaned_df, 'output/loan_data_cleaned.csv')
+    cleaner.save_audit_report(audit_report, 'output/cleaning_audit.json')
+
+    print("\n💾 Saved files:")
+    saved_files = (
+        "  - output/loan_data_original.csv (original)",
+        "  - output/loan_data_cleaned.csv (cleaned)",
+        "  - output/cleaning_audit.json (audit report)",
+    )
+    for entry in saved_files:
+        print(entry)
+
+    print("\n📈 Step 4: User can now choose which dataset to analyze:")
+    follow_up_options = (
+        "  Option A: Analyze cleaned dataset (privacy-compliant)",
+        "  Option B: Analyze original dataset (for comparison)",
+        "  Option C: Analyze both and compare results",
+    )
+    for option in follow_up_options:
+        print(option)
+
+    cleaner.print_audit_summary(audit_report)
+
+
+def main():
+    """Print the test menu, read one selection from stdin, and run it.
+
+    Choices ``1``-``5`` run a single test function; ``6`` runs every
+    test that needs no user input, in menu order. Any other input prints
+    a usage hint.
+
+    Returns:
+        None.
+    """
+    rule = "=" * 70
+    print("\n" + rule)
+    print("🧪 DATA CLEANING MODULE - TEST SUITE")
+    print(rule)
+
+    print("\nAvailable tests:")
+    menu_entries = (
+        "  1. Basic PII detection on loan dataset",
+        "  2. Cleaning with pre-flagged risky features",
+        "  3. Synthetic PII detection (comprehensive)",
+        "  4. Interactive mode (requires user input)",
+        "  5. Integration workflow demonstration",
+        "  6. Run all non-interactive tests",
+    )
+    for entry in menu_entries:
+        print(entry)
+
+    choice = input("\nSelect test (1-6): ").strip()
+
+    if choice == '6':
+        # Batch mode leaves out the interactive test on purpose.
+        print("\n🏃 Running all non-interactive tests...\n")
+        for run_test in (
+            test_basic_cleaning,
+            test_with_risky_features,
+            test_with_synthetic_pii,
+            demonstrate_integration_with_analysis,
+        ):
+            run_test()
+        print("\n✅ All tests completed!")
+        return
+
+    # Menu key -> the single test it runs.
+    single_tests = {
+        '1': test_basic_cleaning,
+        '2': test_with_risky_features,
+        '3': test_with_synthetic_pii,
+        '4': test_interactive_mode,
+        '5': demonstrate_integration_with_analysis,
+    }
+    selected = single_tests.get(choice)
+    if selected is None:
+        print("Invalid choice. Run: python test_cleaning.py")
+        return
+
+    selected()
+
+
+# Show the test menu only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()