ref:Cleaned the Structure

2026-02-07 22:18:59 +00:00 · 2025-11-07 14:20:22 +05:30
parent d7dbcff4b6
commit 1b622b6a19
42 changed files with 2164 additions and 806 deletions
--- a/data_cleaning/config.py
+++ b/data_cleaning/config.py
@@ -0,0 +1,245 @@
+"""
+Cleaning Configuration
+Customize PII detection and anonymization strategies
+"""
+
+# Anonymization Strategy Definitions
+STRATEGIES = {
+    'REMOVE': {
+        'description': 'Delete entire column',
+        'risk_level': 'HIGH',
+        'reversible': False,
+        'use_cases': ['Credit cards', 'SSN', 'Bank accounts']
+    },
+    'HASH': {
+        'description': 'One-way SHA-256 hash',
+        'risk_level': 'MEDIUM',
+        'reversible': False,
+        'use_cases': ['Emails', 'Phone numbers', 'Names']
+    },
+    'MASK': {
+        'description': 'Replace with asterisks',
+        'risk_level': 'LOW',
+        'reversible': False,
+        'use_cases': ['Partial identifiers', 'Locations']
+    },
+    'GENERALIZE': {
+        'description': 'Reduce precision',
+        'risk_level': 'LOW',
+        'reversible': False,
+        'use_cases': ['Dates', 'Ages', 'ZIP codes']
+    },
+    'KEEP': {
+        'description': 'No changes',
+        'risk_level': 'NONE',
+        'reversible': True,
+        'use_cases': ['Non-sensitive data']
+    }
+}
+
+# Entity Type to Strategy Mapping
+# Customize these based on your compliance requirements
+ENTITY_STRATEGY_MAP = {
+    # Financial Identifiers - HIGHEST RISK
+    'CREDIT_CARD': 'REMOVE',
+    'CRYPTO': 'REMOVE',
+    'IBAN_CODE': 'REMOVE',
+    'US_BANK_NUMBER': 'REMOVE',
+    
+    # Government IDs - HIGH RISK
+    'US_SSN': 'REMOVE',
+    'US_DRIVER_LICENSE': 'REMOVE',
+    'US_PASSPORT': 'REMOVE',
+    'US_ITIN': 'REMOVE',
+    'UK_NHS': 'REMOVE',
+    'SG_NRIC_FIN': 'REMOVE',
+    'IN_PAN': 'REMOVE',
+    
+    # Nordic National IDs - HIGH RISK (CRITICAL)
+    'FI_PERSONAL_ID': 'REMOVE',  # Finnish Henkilötunnus (HETU)
+    'SE_PERSONAL_ID': 'REMOVE',  # Swedish Personnummer
+    'NO_PERSONAL_ID': 'REMOVE',  # Norwegian Fødselsnummer
+    'DK_PERSONAL_ID': 'REMOVE',  # Danish CPR-nummer
+    'FI_KELA_ID': 'REMOVE',      # Finnish social security (Kela)
+    
+    # Health Information - HIGH RISK (GDPR Art. 9)
+    'MEDICAL_LICENSE': 'REMOVE',
+    
+    # Contact Information - MEDIUM RISK
+    'EMAIL_ADDRESS': 'HASH',
+    'PHONE_NUMBER': 'HASH',
+    'URL': 'HASH',
+    
+    # Personal Identifiers - MEDIUM RISK
+    'PERSON': 'HASH',  # Names
+    'IP_ADDRESS': 'HASH',
+    
+    # Nordic Business Identifiers - MEDIUM RISK
+    'FI_BUSINESS_ID': 'HASH',  # Finnish Y-tunnus (less sensitive than personal IDs)
+    
+    # Geographic Information - LOW RISK
+    'LOCATION': 'MASK',
+    'US_ZIP_CODE': 'GENERALIZE',
+    
+    # Temporal Information - LOW RISK
+    'DATE_TIME': 'GENERALIZE',
+    
+    # Special Categories - MEDIUM RISK (GDPR Art. 9)
+    'NRP': 'HASH',  # Nationality, religious, political views
+    
+    # Business Identifiers - LOW RISK
+    'AU_ABN': 'HASH',
+    'AU_ACN': 'HASH',
+    'AU_TFN': 'HASH',
+}
+
+# Confidence Score Thresholds
+CONFIDENCE_THRESHOLDS = {
+    'HIGH': 0.85,      # Auto-apply strategy
+    'MEDIUM': 0.60,    # Prompt user in interactive mode
+    'LOW': 0.40,       # Treat as potential false positive
+}
+
+# GDPR Article Mappings
+GDPR_COMPLIANCE = {
+    'CREDIT_CARD': 'Art. 4(1) - Personal data identifier',
+    'US_SSN': 'Art. 4(1) - Personal data identifier',
+    'US_BANK_NUMBER': 'Art. 4(1) - Personal data identifier',
+    'EMAIL_ADDRESS': 'Art. 4(1) - Personal data identifier',
+    'PHONE_NUMBER': 'Art. 4(1) - Personal data identifier',
+    'PERSON': 'Art. 4(1) - Personal data (name)',
+    'LOCATION': 'Art. 4(1) - Personal data (location)',
+    'IP_ADDRESS': 'Art. 4(1) - Online identifier',
+    'MEDICAL_LICENSE': 'Art. 9(1) - Special category data (health)',
+    'NRP': 'Art. 9(1) - Special category data (political/religious views)',
+    'DATE_TIME': 'Art. 4(1) - Personal data (temporal information)',
+    
+    # Nordic National IDs
+    'FI_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
+    'SE_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
+    'NO_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
+    'DK_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
+    'FI_KELA_ID': 'Art. 9(1) - Special category (health/social security)',
+    'FI_BUSINESS_ID': 'Art. 4(1) - Organizational identifier (lower risk)',
+}
+
+# Presidio Analyzer Settings
+PRESIDIO_CONFIG = {
+    'language': 'en',
+    'score_threshold': 0.5,  # Minimum confidence to report
+    'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
+    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+}
+
+# Custom Recognizers (domain-specific patterns)
+# Add patterns specific to your industry/use case
+CUSTOM_PATTERNS = {
+    'LOAN_ID': {
+        'pattern': r'LN\d{8}',
+        'score': 0.9,
+        'strategy': 'HASH'
+    },
+    'EMPLOYEE_ID': {
+        'pattern': r'EMP\d{6}',
+        'score': 0.9,
+        'strategy': 'HASH'
+    },
+    'ACCOUNT_NUMBER': {
+        'pattern': r'ACC\d{10}',
+        'score': 0.95,
+        'strategy': 'REMOVE'
+    }
+}
+
+# Column Name Heuristics
+# Auto-flag columns based on name patterns
+RISKY_COLUMN_PATTERNS = [
+    r'.*email.*',
+    r'.*phone.*',
+    r'.*ssn.*',
+    r'.*social.*security.*',
+    r'.*credit.*card.*',
+    r'.*passport.*',
+    r'.*license.*',
+    r'.*address.*',
+    r'.*ip.*addr.*',
+]
+
+# Protected Attributes Configuration
+# These are needed for bias analysis but may contain PII
+PROTECTED_ATTRIBUTES_HANDLING = {
+    'default_strategy': 'KEEP',  # Keep for bias analysis
+    'warn_user': True,  # Warn about privacy implications
+    'alternative': 'Use generalization (e.g., age_group instead of exact age)'
+}
+
+# Audit Report Settings
+AUDIT_CONFIG = {
+    'include_examples': True,
+    'max_examples_per_column': 3,
+    'truncate_values': 50,  # Max characters to show in examples
+    'include_presidio_metrics': True,
+    'include_gdpr_references': True,
+    'include_recommendations': True
+}
+
+# Performance Settings
+PERFORMANCE_CONFIG = {
+    'sample_size_for_detection': 1000,  # Max rows to analyze per column
+    'batch_size': 100,  # Rows to process per batch
+    'enable_parallel': False,  # Future: parallel column processing
+}
+
+# Output Settings
+OUTPUT_CONFIG = {
+    'cleaned_suffix': '_cleaned',
+    'audit_suffix': '_audit',
+    'format': 'csv',  # Future: support parquet, json
+    'compression': None,  # Future: gzip, bz2
+}
+
+
+def get_strategy_for_entity(entity_type: str) -> str:
+    """
+    Get anonymization strategy for an entity type
+    
+    Args:
+        entity_type: Presidio entity type (e.g., 'EMAIL_ADDRESS')
+        
+    Returns:
+        Strategy name (e.g., 'HASH')
+    """
+    return ENTITY_STRATEGY_MAP.get(entity_type, 'HASH')  # Default to HASH if unknown
+
+
+def get_risk_level(strategy: str) -> str:
+    """
+    Get risk level for a strategy
+    
+    Args:
+        strategy: Strategy name (e.g., 'HASH')
+        
+    Returns:
+        Risk level (e.g., 'MEDIUM')
+    """
+    return STRATEGIES.get(strategy, {}).get('risk_level', 'UNKNOWN')
+
+
+def is_high_confidence(score: float) -> bool:
+    """Check if confidence score is high enough for auto-processing"""
+    return score >= CONFIDENCE_THRESHOLDS['HIGH']
+
+
+def is_medium_confidence(score: float) -> bool:
+    """Check if confidence score requires user confirmation"""
+    return CONFIDENCE_THRESHOLDS['MEDIUM'] <= score < CONFIDENCE_THRESHOLDS['HIGH']
+
+
+def is_low_confidence(score: float) -> bool:
+    """Check if confidence score might be false positive"""
+    return score < CONFIDENCE_THRESHOLDS['MEDIUM']
+
+
+# Example usage in cleaning.py:
+# from cleaning_config import ENTITY_STRATEGY_MAP, get_strategy_for_entity
+# strategy = get_strategy_for_entity('EMAIL_ADDRESS')  # Returns 'HASH'