# Mirror of https://github.com/PlatypusPus/MushroomEmpire.git
# Synced 2026-02-07 22:18:59 +00:00 (367 lines, 13 KiB, Python)
"""
Test script for the data cleaning module.

Tests general PII + Nordic-specific PII detection with automatic report generation.
"""
|
||
|
||
import pandas as pd
|
||
import sys
|
||
import os
|
||
|
||
# Add the directory containing this script to sys.path so `data_cleaning` can be imported
|
||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||
|
||
from data_cleaning import DataCleaner
|
||
|
||
|
||
def test_basic_cleaning():
    """Run non-interactive cleaning over the loan dataset and print the audit.

    Reads ``Datasets/loan_data.csv`` (a path relative to the current working
    directory), hands the frame to ``DataCleaner`` and calls ``clean`` with
    ``risky_features=None``, ``interactive=False`` and ``scan_all_cells=True``.

    Returns:
        The ``(cleaned_df, audit_report)`` pair returned by ``DataCleaner.clean``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 1: Basic PII Detection on Loan Dataset")
    print(rule)

    # Load the loan data and report its dimensions
    loan_df = pd.read_csv('Datasets/loan_data.csv')
    row_count = len(loan_df)
    column_count = len(loan_df.columns)
    print(f"\n✓ Loaded dataset: {row_count} rows × {column_count} columns")
    print(f" Columns: {list(loan_df.columns)}")

    cleaner = DataCleaner(loan_df)

    # Non-interactive run: no prompts, no pre-selected columns
    print("\n🔍 Running PII detection...")
    outcome = cleaner.clean(
        risky_features=None,   # Auto-detect all
        interactive=False,     # Non-interactive for testing
        scan_all_cells=True,
    )
    cleaned_df, audit_report = outcome

    cleaner.print_audit_summary(audit_report)

    return cleaned_df, audit_report
|
||
|
||
|
||
def test_with_risky_features():
    """Clean the loan dataset using a hard-coded list of flagged columns.

    Loads ``Datasets/loan_data.csv`` and passes three column names as
    ``risky_features`` to ``DataCleaner.clean`` (``interactive=False``,
    ``scan_all_cells=False``), then prints the audit summary.

    Returns:
        The ``(cleaned_df, audit_report)`` pair returned by ``DataCleaner.clean``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 2: Cleaning with Pre-Flagged Risky Features")
    print(rule)

    loan_df = pd.read_csv('Datasets/loan_data.csv')

    # Stand-in for the feature list a RiskAnalyzer run would hand over
    flagged_columns = ['person_education', 'loan_intent', 'person_home_ownership']
    print(f"\n⚠️ Risky features flagged by RiskAnalyzer: {flagged_columns}")

    cleaner = DataCleaner(loan_df)

    # Restrict the run to the flagged columns
    cleaned_df, audit_report = cleaner.clean(
        risky_features=flagged_columns,
        interactive=False,
        scan_all_cells=False,  # Only scan risky columns
    )

    cleaner.print_audit_summary(audit_report)

    return cleaned_df, audit_report
|
||
|
||
|
||
def test_with_synthetic_pii():
    """Clean a five-row synthetic frame of general PII and write reports.

    Builds a DataFrame with id, email, phone, SSN, free-text notes and amount
    columns, runs ``DataCleaner.clean`` non-interactively with
    ``scan_all_cells=True``, prints the cleaned head and the audit summary,
    then writes the cleaned CSV, the audit JSON and the simple/detailed
    reports under ``output/`` (created if missing).

    Returns:
        The ``(cleaned_df, audit_report)`` pair returned by ``DataCleaner.clean``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 3: General PII Detection (US/International)")
    print(rule)

    # Columns of the synthetic frame; dict order fixes the column order
    columns = {
        'customer_id': [1, 2, 3, 4, 5],
        'email': [
            'john.doe@example.com',
            'alice.smith@company.org',
            'bob.jones@email.com',
            'carol.white@test.net',
            'dave.brown@sample.com',
        ],
        'phone': [
            '+1-555-123-4567',
            '555-234-5678',
            '(555) 345-6789',
            '555.456.7890',
            '5555678901',
        ],
        'ssn': [
            '123-45-6789',
            '234-56-7890',
            '345-67-8901',
            '456-78-9012',
            '567-89-0123',
        ],
        'notes': [
            'Customer called from 192.168.1.1',
            'Contact via email: test@example.com',
            'SSN verified: 111-22-3333',
            'Previous address: 123 Main St, Boston',
            'Phone backup: 555-999-8888',
        ],
        'amount': [1000, 2000, 1500, 3000, 2500],
    }
    test_data = pd.DataFrame(columns)

    print("\n✓ Created synthetic dataset with general PII:")
    print(test_data.head())

    cleaner = DataCleaner(test_data)

    cleaned_df, audit_report = cleaner.clean(
        risky_features=None,
        interactive=False,
        scan_all_cells=True,
    )

    print("\n🔒 Cleaned dataset:")
    print(cleaned_df.head())

    cleaner.print_audit_summary(audit_report)

    # Persist the cleaned data and the raw audit
    os.makedirs('output', exist_ok=True)
    cleaner.save_cleaned_data(cleaned_df, 'output/general_pii_cleaned.csv')
    cleaner.save_audit_report(audit_report, 'output/general_pii_audit.json')

    # Both report flavours carry the same dataset label
    dataset_label = 'General PII Test Dataset'
    print("\n📊 Generating explainability reports...")
    cleaner.save_simple_report(
        audit_report, 'output/general_pii_simple_report.json', dataset_label
    )
    cleaner.save_detailed_report(
        audit_report, 'output/general_pii_detailed_report.json', dataset_label
    )

    return cleaned_df, audit_report
|
||
|
||
|
||
def test_nordic_pii():
    """Clean a synthetic Nordic healthcare frame and write reports.

    Builds a five-row DataFrame holding Finnish, Swedish, Norwegian and
    Danish personal-ID columns alongside names, emails, phone numbers and
    clinical fields, runs ``DataCleaner.clean`` non-interactively with
    ``scan_all_cells=True``, prints the result and the audit summary, then
    writes the cleaned CSV, audit JSON and simple/detailed reports under
    ``output/`` (created if missing).

    NOTE(review): the closing "entities detected" lines are printed
    unconditionally; this function does not inspect ``audit_report`` to
    confirm that those entity types were actually found.

    Returns:
        The ``(cleaned_df, audit_report)`` pair returned by ``DataCleaner.clean``.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 4: Nordic PII Detection (FI, SE, NO, DK)")
    print(rule)

    # Nordic healthcare test dataset; dict order fixes the column order
    columns = {
        'patient_id': [1001, 1002, 1003, 1004, 1005],
        'name': ['Matti Virtanen', 'Anna Andersson', 'Lars Nilsen', 'Sofie Jensen', 'Björn Eriksson'],
        # Finnish
        'henkilotunnus': ['010190-123A', '150685-456B', '310795+789C', '220503A234D', '111280-567E'],
        # Swedish
        'personnummer': [None, '850615-4567', None, None, '801211-8901'],
        # Norwegian
        'fodselsnummer': [None, None, '010190 12345', None, None],
        # Danish
        'cpr_nummer': [None, None, None, '010190-1234', None],
        'email': ['matti.v@example.fi', 'anna.a@healthcare.se', 'lars.n@clinic.no', 'sofie.j@hospital.dk', 'bjorn.e@care.se'],
        'phone': ['+358 50 123 4567', '+46 70 234 5678', '+47 91 345 6789', '+45 20 456 7890', '+46 73 567 8901'],
        'diagnosis_code': ['J06.9', 'I10', 'E11.9', 'M54.5', 'F41.1'],
        'age': [35, 39, 29, 22, 45],
        'gender': ['M', 'F', 'M', 'F', 'M'],
        'treatment_outcome': ['Recovered', 'Ongoing', 'Recovered', 'Ongoing', 'Recovered'],
    }
    nordic_data = pd.DataFrame(columns)

    # Describe what the synthetic frame contains
    for line in (
        "\n✓ Created Nordic healthcare dataset:",
        " - Finnish Henkilötunnus (HETU)",
        " - Swedish Personnummer",
        " - Norwegian Fødselsnummer",
        " - Danish CPR-nummer",
        " - Nordic phone numbers (+358, +46, +47, +45)",
        " - Nordic email domains (.fi, .se, .no, .dk)",
    ):
        print(line)
    print()
    print(nordic_data.to_string())

    # Initialize cleaner (Nordic recognizers loaded automatically)
    cleaner = DataCleaner(nordic_data)

    cleaned_df, audit_report = cleaner.clean(
        risky_features=None,
        interactive=False,
        scan_all_cells=True,
    )

    print("\n🔒 Cleaned dataset (Nordic IDs anonymized):")
    print(cleaned_df.to_string())

    cleaner.print_audit_summary(audit_report)

    # Persist the cleaned data and the raw audit
    os.makedirs('output', exist_ok=True)
    cleaner.save_cleaned_data(cleaned_df, 'output/nordic_pii_cleaned.csv')
    cleaner.save_audit_report(audit_report, 'output/nordic_pii_audit.json')

    # Both report flavours carry the same dataset label
    dataset_label = 'Nordic Healthcare Dataset'
    print("\n📊 Generating explainability reports...")
    cleaner.save_simple_report(
        audit_report, 'output/nordic_pii_simple_report.json', dataset_label
    )
    cleaner.save_detailed_report(
        audit_report, 'output/nordic_pii_detailed_report.json', dataset_label
    )

    for line in (
        "\n✅ Nordic-specific entities detected:",
        " ✓ FI_PERSONAL_ID (Finnish Henkilötunnus)",
        " ✓ SE_PERSONAL_ID (Swedish Personnummer)",
        " ✓ NO_PERSONAL_ID (Norwegian Fødselsnummer)",
        " ✓ DK_PERSONAL_ID (Danish CPR-nummer)",
    ):
        print(line)

    return cleaned_df, audit_report
|
||
|
||
|
||
def test_interactive_mode():
    """Optionally run ``DataCleaner.clean`` with ``interactive=True``.

    Builds a three-row frame whose ``description`` column holds ambiguous
    text, then asks on stdin whether to proceed. Only the answer ``y``
    (case-insensitive, surrounding whitespace ignored) starts the cleaning
    run; any other answer prints a skip message.

    Returns:
        None on both paths.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 5: Interactive Mode (Manual Decisions)")
    print(rule)

    # Ambiguous free text for the interactive prompts
    descriptions = [
        'Customer from Paris contacted us',       # Paris = location or name?
        'Spoke with Jordan about the account',    # Jordan = location or name?
        'Meeting scheduled for March 15th',       # Date
    ]
    test_data = pd.DataFrame({
        'id': [1, 2, 3],
        'description': descriptions,
        'value': [100, 200, 300],
    })

    print("\n✓ Created dataset with ambiguous PII:")
    print(test_data)

    print("\n⚠️ This test requires user input for ambiguous cases.")
    print(" You'll be prompted to choose anonymization strategies.")

    answer = input("\nProceed with interactive test? (y/n): ").strip().lower()

    # Anything other than an explicit "y" skips the run
    if answer != 'y':
        print(" Skipped interactive test.")
        return

    cleaner = DataCleaner(test_data)
    cleaned_df, audit_report = cleaner.clean(
        risky_features=None,
        interactive=True,  # Enable interactive prompts
        scan_all_cells=True,
    )

    print("\n🔒 Cleaned dataset:")
    print(cleaned_df)

    cleaner.print_audit_summary(audit_report)
|
||
|
||
|
||
def demonstrate_integration_with_analysis():
    """Walk through the cleaning step of the analysis workflow on loan data.

    Loads ``Datasets/loan_data.csv``, prints the four-step workflow, cleans
    two hard-coded "risky" columns non-interactively
    (``scan_all_cells=False``), and writes the original CSV, the cleaned CSV
    and the audit JSON under ``output/`` (created if missing). It finishes by
    printing the follow-up options and the audit summary.

    Returns:
        None.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("TEST 6: Integration Demo (Cleaning → Analysis Workflow)")
    print(rule)

    loan_df = pd.read_csv('Datasets/loan_data.csv')

    for line in (
        "\n📊 Workflow:",
        " 1. Original dataset → Risk Analysis",
        " 2. Risk Analysis → Identifies risky features",
        " 3. Risky features → Data Cleaning (this step)",
        " 4. Cleaned dataset → Re-run Analysis (optional)",
    ):
        print(line)

    # Stand-in for the output of the analysis step
    simulated_risky_features = ['person_education', 'loan_intent']
    print(f"\n⚠️ Step 2 Output (simulated): Risky features = {simulated_risky_features}")

    # Step 3: clean only the flagged columns
    print("\n🔒 Step 3: Cleaning risky features...")
    cleaner = DataCleaner(loan_df)
    cleaned_df, audit_report = cleaner.clean(
        risky_features=simulated_risky_features,
        interactive=False,
        scan_all_cells=False,
    )

    # Keep the original next to the cleaned copy so both can be analysed
    os.makedirs('output', exist_ok=True)
    loan_df.to_csv('output/loan_data_original.csv', index=False)
    cleaner.save_cleaned_data(cleaned_df, 'output/loan_data_cleaned.csv')
    cleaner.save_audit_report(audit_report, 'output/cleaning_audit.json')

    for line in (
        "\n💾 Saved files:",
        " - output/loan_data_original.csv (original)",
        " - output/loan_data_cleaned.csv (cleaned)",
        " - output/cleaning_audit.json (audit report)",
        "\n📈 Step 4: User can now choose which dataset to analyze:",
        " Option A: Analyze cleaned dataset (privacy-compliant)",
        " Option B: Analyze original dataset (for comparison)",
        " Option C: Analyze both and compare results",
    ):
        print(line)

    cleaner.print_audit_summary(audit_report)
|
||
|
||
|
||
def main():
    """Show the test menu, read one choice from stdin and run it.

    Choices ``1``-``6`` each run a single test function. ``7`` runs every
    non-interactive test in sequence, and ``8`` runs the two synthetic PII
    tests and then lists the files they write. Any other input prints a
    usage hint.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("🧪 DATA CLEANING MODULE - TEST SUITE")
    print(" General PII + Nordic-Specific PII Detection")
    print(rule)

    for line in (
        "\nAvailable tests:",
        " 1. Basic PII detection on loan dataset",
        " 2. Cleaning with pre-flagged risky features",
        " 3. General PII detection (US/International) + Reports",
        " 4. Nordic PII detection (FI, SE, NO, DK) + Reports",
        " 5. Interactive mode (requires user input)",
        " 6. Integration workflow demonstration",
        " 7. Run all non-interactive tests",
        " 8. Run Nordic + General PII tests only",
    ):
        print(line)

    choice = input("\nSelect test (1-8): ").strip()

    # Menu entries that map to exactly one test function
    single_tests = {
        '1': test_basic_cleaning,
        '2': test_with_risky_features,
        '3': test_with_synthetic_pii,
        '4': test_nordic_pii,
        '5': test_interactive_mode,
        '6': demonstrate_integration_with_analysis,
    }

    if choice in single_tests:
        single_tests[choice]()
    elif choice == '7':
        print("\n🏃 Running all non-interactive tests...\n")
        for run_test in (
            test_basic_cleaning,
            test_with_risky_features,
            test_with_synthetic_pii,
            test_nordic_pii,
            demonstrate_integration_with_analysis,
        ):
            run_test()
        print("\n✅ All tests completed!")
    elif choice == '8':
        print("\n🏃 Running PII detection tests with report generation...\n")
        test_with_synthetic_pii()
        test_nordic_pii()
        print("\n" + rule)
        print("✅ PII TESTS COMPLETED!")
        print(rule)
        for line in (
            "\n📂 Generated files in output/:",
            " General PII:",
            " - general_pii_cleaned.csv",
            " - general_pii_audit.json",
            " - general_pii_simple_report.json",
            " - general_pii_detailed_report.json",
            "\n Nordic PII:",
            " - nordic_pii_cleaned.csv",
            " - nordic_pii_audit.json",
            " - nordic_pii_simple_report.json",
            " - nordic_pii_detailed_report.json",
            "\n💡 Review the simple reports for executive summaries",
            "💡 Review the detailed reports for compliance documentation",
        ):
            print(line)
    else:
        print("Invalid choice. Run: python test_cleaning.py")
|
||
|
||
|
||
# Show the interactive menu only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|