Files
MushroomEmpire/test_cleaning.py
2025-11-07 14:20:22 +05:30

367 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Test script for data cleaning module
Tests general PII + Nordic-specific PII detection with automatic report generation
"""
import pandas as pd
import sys
import os
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from data_cleaning import DataCleaner
def test_basic_cleaning():
"""Test basic cleaning functionality"""
print("\n" + "="*70)
print("TEST 1: Basic PII Detection on Loan Dataset")
print("="*70)
# Load loan data
df = pd.read_csv('Datasets/loan_data.csv')
print(f"\n✓ Loaded dataset: {len(df)} rows × {len(df.columns)} columns")
print(f" Columns: {list(df.columns)}")
# Initialize cleaner
cleaner = DataCleaner(df)
# Run cleaning in non-interactive mode (auto-apply strategies)
print("\n🔍 Running PII detection...")
cleaned_df, audit_report = cleaner.clean(
risky_features=None, # Auto-detect all
interactive=False, # Non-interactive for testing
scan_all_cells=True
)
# Display results
cleaner.print_audit_summary(audit_report)
return cleaned_df, audit_report
def test_with_risky_features():
"""Test cleaning with specific risky features flagged"""
print("\n" + "="*70)
print("TEST 2: Cleaning with Pre-Flagged Risky Features")
print("="*70)
# Load loan data
df = pd.read_csv('Datasets/loan_data.csv')
# Simulate risky features from RiskAnalyzer
risky_features = ['person_education', 'loan_intent', 'person_home_ownership']
print(f"\n⚠️ Risky features flagged by RiskAnalyzer: {risky_features}")
# Initialize cleaner
cleaner = DataCleaner(df)
# Run cleaning on flagged features only
cleaned_df, audit_report = cleaner.clean(
risky_features=risky_features,
interactive=False,
scan_all_cells=False # Only scan risky columns
)
# Display results
cleaner.print_audit_summary(audit_report)
return cleaned_df, audit_report
def test_with_synthetic_pii():
"""Test with synthetic general PII data"""
print("\n" + "="*70)
print("TEST 3: General PII Detection (US/International)")
print("="*70)
# Create test DataFrame with obvious PII
test_data = pd.DataFrame({
'customer_id': [1, 2, 3, 4, 5],
'email': [
'john.doe@example.com',
'alice.smith@company.org',
'bob.jones@email.com',
'carol.white@test.net',
'dave.brown@sample.com'
],
'phone': [
'+1-555-123-4567',
'555-234-5678',
'(555) 345-6789',
'555.456.7890',
'5555678901'
],
'ssn': [
'123-45-6789',
'234-56-7890',
'345-67-8901',
'456-78-9012',
'567-89-0123'
],
'notes': [
'Customer called from 192.168.1.1',
'Contact via email: test@example.com',
'SSN verified: 111-22-3333',
'Previous address: 123 Main St, Boston',
'Phone backup: 555-999-8888'
],
'amount': [1000, 2000, 1500, 3000, 2500]
})
print(f"\n✓ Created synthetic dataset with general PII:")
print(test_data.head())
# Initialize cleaner
cleaner = DataCleaner(test_data)
# Run cleaning
cleaned_df, audit_report = cleaner.clean(
risky_features=None,
interactive=False,
scan_all_cells=True
)
print("\n🔒 Cleaned dataset:")
print(cleaned_df.head())
# Display results
cleaner.print_audit_summary(audit_report)
# Save outputs
os.makedirs('output', exist_ok=True)
cleaner.save_cleaned_data(cleaned_df, 'output/general_pii_cleaned.csv')
cleaner.save_audit_report(audit_report, 'output/general_pii_audit.json')
# Generate reports
print("\n📊 Generating explainability reports...")
cleaner.save_simple_report(audit_report, 'output/general_pii_simple_report.json', 'General PII Test Dataset')
cleaner.save_detailed_report(audit_report, 'output/general_pii_detailed_report.json', 'General PII Test Dataset')
return cleaned_df, audit_report
def test_nordic_pii():
"""Test with Nordic-specific PII data (Finnish, Swedish, Norwegian, Danish)"""
print("\n" + "="*70)
print("TEST 4: Nordic PII Detection (FI, SE, NO, DK)")
print("="*70)
# Create Nordic healthcare test dataset
nordic_data = pd.DataFrame({
'patient_id': [1001, 1002, 1003, 1004, 1005],
'name': ['Matti Virtanen', 'Anna Andersson', 'Lars Nilsen', 'Sofie Jensen', 'Björn Eriksson'],
'henkilotunnus': ['010190-123A', '150685-456B', '310795+789C', '220503A234D', '111280-567E'], # Finnish
'personnummer': [None, '850615-4567', None, None, '801211-8901'], # Swedish
'fodselsnummer': [None, None, '010190 12345', None, None], # Norwegian
'cpr_nummer': [None, None, None, '010190-1234', None], # Danish
'email': ['matti.v@example.fi', 'anna.a@healthcare.se', 'lars.n@clinic.no', 'sofie.j@hospital.dk', 'bjorn.e@care.se'],
'phone': ['+358 50 123 4567', '+46 70 234 5678', '+47 91 345 6789', '+45 20 456 7890', '+46 73 567 8901'],
'diagnosis_code': ['J06.9', 'I10', 'E11.9', 'M54.5', 'F41.1'],
'age': [35, 39, 29, 22, 45],
'gender': ['M', 'F', 'M', 'F', 'M'],
'treatment_outcome': ['Recovered', 'Ongoing', 'Recovered', 'Ongoing', 'Recovered']
})
print(f"\n✓ Created Nordic healthcare dataset:")
print(f" - Finnish Henkilötunnus (HETU)")
print(f" - Swedish Personnummer")
print(f" - Norwegian Fødselsnummer")
print(f" - Danish CPR-nummer")
print(f" - Nordic phone numbers (+358, +46, +47, +45)")
print(f" - Nordic email domains (.fi, .se, .no, .dk)")
print()
print(nordic_data.to_string())
# Initialize cleaner (Nordic recognizers loaded automatically)
cleaner = DataCleaner(nordic_data)
# Run cleaning
cleaned_df, audit_report = cleaner.clean(
risky_features=None,
interactive=False,
scan_all_cells=True
)
print("\n🔒 Cleaned dataset (Nordic IDs anonymized):")
print(cleaned_df.to_string())
# Display results
cleaner.print_audit_summary(audit_report)
# Save outputs
os.makedirs('output', exist_ok=True)
cleaner.save_cleaned_data(cleaned_df, 'output/nordic_pii_cleaned.csv')
cleaner.save_audit_report(audit_report, 'output/nordic_pii_audit.json')
# Generate reports
print("\n📊 Generating explainability reports...")
cleaner.save_simple_report(audit_report, 'output/nordic_pii_simple_report.json', 'Nordic Healthcare Dataset')
cleaner.save_detailed_report(audit_report, 'output/nordic_pii_detailed_report.json', 'Nordic Healthcare Dataset')
print("\n✅ Nordic-specific entities detected:")
print(" ✓ FI_PERSONAL_ID (Finnish Henkilötunnus)")
print(" ✓ SE_PERSONAL_ID (Swedish Personnummer)")
print(" ✓ NO_PERSONAL_ID (Norwegian Fødselsnummer)")
print(" ✓ DK_PERSONAL_ID (Danish CPR-nummer)")
return cleaned_df, audit_report
def test_interactive_mode():
"""Test interactive mode (requires user input)"""
print("\n" + "="*70)
print("TEST 5: Interactive Mode (Manual Decisions)")
print("="*70)
# Create ambiguous test data
test_data = pd.DataFrame({
'id': [1, 2, 3],
'description': [
'Customer from Paris contacted us', # Paris = location or name?
'Spoke with Jordan about the account', # Jordan = location or name?
'Meeting scheduled for March 15th' # Date
],
'value': [100, 200, 300]
})
print(f"\n✓ Created dataset with ambiguous PII:")
print(test_data)
print("\n⚠️ This test requires user input for ambiguous cases.")
print(" You'll be prompted to choose anonymization strategies.")
proceed = input("\nProceed with interactive test? (y/n): ").strip().lower()
if proceed == 'y':
cleaner = DataCleaner(test_data)
cleaned_df, audit_report = cleaner.clean(
risky_features=None,
interactive=True, # Enable interactive prompts
scan_all_cells=True
)
print("\n🔒 Cleaned dataset:")
print(cleaned_df)
cleaner.print_audit_summary(audit_report)
else:
print(" Skipped interactive test.")
def demonstrate_integration_with_analysis():
"""Demonstrate how cleaning integrates with AI governance pipeline"""
print("\n" + "="*70)
print("TEST 6: Integration Demo (Cleaning → Analysis Workflow)")
print("="*70)
# Load data
df = pd.read_csv('Datasets/loan_data.csv')
print("\n📊 Workflow:")
print(" 1. Original dataset → Risk Analysis")
print(" 2. Risk Analysis → Identifies risky features")
print(" 3. Risky features → Data Cleaning (this step)")
print(" 4. Cleaned dataset → Re-run Analysis (optional)")
# Simulate risky features from analysis
simulated_risky_features = ['person_education', 'loan_intent']
print(f"\n⚠️ Step 2 Output (simulated): Risky features = {simulated_risky_features}")
# Step 3: Clean data
print("\n🔒 Step 3: Cleaning risky features...")
cleaner = DataCleaner(df)
cleaned_df, audit_report = cleaner.clean(
risky_features=simulated_risky_features,
interactive=False,
scan_all_cells=False
)
# Save both datasets
os.makedirs('output', exist_ok=True)
df.to_csv('output/loan_data_original.csv', index=False)
cleaner.save_cleaned_data(cleaned_df, 'output/loan_data_cleaned.csv')
cleaner.save_audit_report(audit_report, 'output/cleaning_audit.json')
print("\n💾 Saved files:")
print(" - output/loan_data_original.csv (original)")
print(" - output/loan_data_cleaned.csv (cleaned)")
print(" - output/cleaning_audit.json (audit report)")
print("\n📈 Step 4: User can now choose which dataset to analyze:")
print(" Option A: Analyze cleaned dataset (privacy-compliant)")
print(" Option B: Analyze original dataset (for comparison)")
print(" Option C: Analyze both and compare results")
cleaner.print_audit_summary(audit_report)
def main():
"""Run all tests"""
print("\n" + "="*70)
print("🧪 DATA CLEANING MODULE - TEST SUITE")
print(" General PII + Nordic-Specific PII Detection")
print("="*70)
print("\nAvailable tests:")
print(" 1. Basic PII detection on loan dataset")
print(" 2. Cleaning with pre-flagged risky features")
print(" 3. General PII detection (US/International) + Reports")
print(" 4. Nordic PII detection (FI, SE, NO, DK) + Reports")
print(" 5. Interactive mode (requires user input)")
print(" 6. Integration workflow demonstration")
print(" 7. Run all non-interactive tests")
print(" 8. Run Nordic + General PII tests only")
choice = input("\nSelect test (1-8): ").strip()
if choice == '1':
test_basic_cleaning()
elif choice == '2':
test_with_risky_features()
elif choice == '3':
test_with_synthetic_pii()
elif choice == '4':
test_nordic_pii()
elif choice == '5':
test_interactive_mode()
elif choice == '6':
demonstrate_integration_with_analysis()
elif choice == '7':
print("\n🏃 Running all non-interactive tests...\n")
test_basic_cleaning()
test_with_risky_features()
test_with_synthetic_pii()
test_nordic_pii()
demonstrate_integration_with_analysis()
print("\n✅ All tests completed!")
elif choice == '8':
print("\n🏃 Running PII detection tests with report generation...\n")
test_with_synthetic_pii()
test_nordic_pii()
print("\n" + "="*70)
print("✅ PII TESTS COMPLETED!")
print("="*70)
print("\n📂 Generated files in output/:")
print(" General PII:")
print(" - general_pii_cleaned.csv")
print(" - general_pii_audit.json")
print(" - general_pii_simple_report.json")
print(" - general_pii_detailed_report.json")
print("\n Nordic PII:")
print(" - nordic_pii_cleaned.csv")
print(" - nordic_pii_audit.json")
print(" - nordic_pii_simple_report.json")
print(" - nordic_pii_detailed_report.json")
print("\n💡 Review the simple reports for executive summaries")
print("💡 Review the detailed reports for compliance documentation")
else:
print("Invalid choice. Run: python test_cleaning.py")
if __name__ == '__main__':
main()