# Mirror of https://github.com/PlatypusPus/MushroomEmpire.git
# Synced 2026-02-07 22:18:59 +00:00
# 300 lines, 10 KiB, Python
"""
Cleaning Configuration

Customize PII detection and anonymization strategies.
"""

# Anonymization Strategy Definitions
# Each strategy name maps to its metadata: a short description, a risk-level
# label, whether the transformation is reversible, and example use cases.
STRATEGIES = {
    'REMOVE': {
        'description': 'Delete entire column',
        'risk_level': 'HIGH',
        'reversible': False,
        'use_cases': ['Credit cards', 'SSN', 'Bank accounts'],
    },
    'HASH': {
        'description': 'One-way SHA-256 hash',
        'risk_level': 'MEDIUM',
        'reversible': False,
        'use_cases': ['Emails', 'Phone numbers', 'Names'],
    },
    'MASK': {
        'description': 'Replace with asterisks',
        'risk_level': 'LOW',
        'reversible': False,
        'use_cases': ['Partial identifiers', 'Locations'],
    },
    'GENERALIZE': {
        'description': 'Reduce precision',
        'risk_level': 'LOW',
        'reversible': False,
        'use_cases': ['Dates', 'Ages', 'ZIP codes'],
    },
    'KEEP': {
        'description': 'No changes',
        'risk_level': 'NONE',
        'reversible': True,
        'use_cases': ['Non-sensitive data'],
    },
}
# Entity Type to Strategy Mapping
# Customize these based on your compliance requirements.
# Keys are entity type names; values are keys of STRATEGIES.
ENTITY_STRATEGY_MAP = {
    # Financial Identifiers - HIGHEST RISK
    'CREDIT_CARD': 'REMOVE',
    'CRYPTO': 'REMOVE',
    'IBAN_CODE': 'REMOVE',
    'US_BANK_NUMBER': 'REMOVE',

    # Government IDs - HIGH RISK
    'US_SSN': 'REMOVE',
    'US_DRIVER_LICENSE': 'REMOVE',
    'US_PASSPORT': 'REMOVE',
    'US_ITIN': 'REMOVE',
    'UK_NHS': 'REMOVE',
    'SG_NRIC_FIN': 'REMOVE',
    'IN_PAN': 'REMOVE',

    # Nordic National IDs - HIGH RISK (CRITICAL)
    'FI_PERSONAL_ID': 'REMOVE',  # Finnish Henkilötunnus (HETU)
    'SE_PERSONAL_ID': 'REMOVE',  # Swedish Personnummer
    'NO_PERSONAL_ID': 'REMOVE',  # Norwegian Fødselsnummer
    'DK_PERSONAL_ID': 'REMOVE',  # Danish CPR-nummer
    'FI_KELA_ID': 'REMOVE',  # Finnish social security (Kela)

    # Health Information - HIGH RISK (GDPR Art. 9)
    'MEDICAL_LICENSE': 'REMOVE',

    # Contact Information - MEDIUM RISK
    'EMAIL_ADDRESS': 'HASH',
    'PHONE_NUMBER': 'HASH',
    'URL': 'HASH',

    # Personal Identifiers - MEDIUM RISK
    'PERSON': 'HASH',  # Names
    'IP_ADDRESS': 'HASH',

    # Nordic Business Identifiers - MEDIUM RISK
    'FI_BUSINESS_ID': 'HASH',  # Finnish Y-tunnus (less sensitive than personal IDs)

    # Geographic Information - LOW RISK
    'LOCATION': 'MASK',
    'US_ZIP_CODE': 'GENERALIZE',

    # Temporal Information - LOW RISK
    'DATE_TIME': 'GENERALIZE',

    # Special Categories - MEDIUM RISK (GDPR Art. 9)
    'NRP': 'HASH',  # Nationality, religious, political views

    # Business Identifiers - LOW RISK
    'AU_ABN': 'HASH',
    'AU_ACN': 'HASH',
    # NOTE(review): AU_TFN looks like the Australian Tax File Number, which is
    # issued to individuals as well as businesses - consider 'REMOVE' for
    # parity with US_ITIN; confirm against compliance requirements.
    'AU_TFN': 'HASH',
}
# Confidence Score Thresholds
# Read by is_high_confidence / is_medium_confidence / is_low_confidence.
# NOTE: those helpers only read 'HIGH' and 'MEDIUM'; 'LOW' is not referenced
# by any function visible in this file.
CONFIDENCE_THRESHOLDS = {
    'HIGH': 0.85,  # Auto-apply strategy
    'MEDIUM': 0.60,  # Prompt user in interactive mode
    'LOW': 0.40,  # Treat as potential false positive
}
# GDPR Article Mappings
# Maps an entity type name to the GDPR article text quoted for it.
# Not every key of ENTITY_STRATEGY_MAP has an entry here, so look-ups should
# tolerate a missing key.
GDPR_COMPLIANCE = {
    'CREDIT_CARD': 'Art. 4(1) - Personal data identifier',
    'US_SSN': 'Art. 4(1) - Personal data identifier',
    'US_BANK_NUMBER': 'Art. 4(1) - Personal data identifier',
    'EMAIL_ADDRESS': 'Art. 4(1) - Personal data identifier',
    'PHONE_NUMBER': 'Art. 4(1) - Personal data identifier',
    'PERSON': 'Art. 4(1) - Personal data (name)',
    'LOCATION': 'Art. 4(1) - Personal data (location)',
    'IP_ADDRESS': 'Art. 4(1) - Online identifier',
    'MEDICAL_LICENSE': 'Art. 9(1) - Special category data (health)',
    'NRP': 'Art. 9(1) - Special category data (political/religious views)',
    'DATE_TIME': 'Art. 4(1) - Personal data (temporal information)',

    # Nordic National IDs
    'FI_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
    'SE_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
    'NO_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
    'DK_PERSONAL_ID': 'Art. 4(1) - Personal data identifier + Recital 26',
    'FI_KELA_ID': 'Art. 9(1) - Special category (health/social security)',
    'FI_BUSINESS_ID': 'Art. 4(1) - Organizational identifier (lower risk)',
}
# Presidio Analyzer Settings
PRESIDIO_CONFIG = {
    'language': 'en',
    # Minimum confidence to report (raised from 0.5 to reduce false positives).
    # NOTE(review): this equals CONFIDENCE_THRESHOLDS['MEDIUM'] (0.60); if the
    # analyzer drops results below it, is_low_confidence() would presumably
    # never be true for reported results - confirm against the caller.
    'score_threshold': 0.6,
    # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
    'entities': None,
    # Common non-PII values
    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],
}
# Column Context Filters - Ignore specific entity types based on column name patterns
# This prevents false positives when column names provide context
# How the patterns are matched (case sensitivity, match vs. search) is decided
# by the consumer, which is not part of this file.
COLUMN_CONTEXT_FILTERS = {
    # Column name pattern (regex) -> List of entity types to IGNORE in that column
    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
    r'.*code.*': ['PERSON'],  # Codes are rarely names
    # NOTE: '.*id$' matches any name ending in "id" (so it also covers every
    # name the '.*_id$' entry matches, and names such as "paid" or "valid").
    r'.*id$': ['PERSON'],  # IDs ending in 'id' are rarely names
    r'.*_id$': ['PERSON'],  # Same for underscore_id
}
# Value Pattern Exclusions - Ignore values matching these patterns for specific entity types
# This catches false positives based on the actual detected value format
# Maps an entity type name to a list of regex strings. The alternations are
# written in lower case; whether matching is case-insensitive depends on the
# flags used by the consumer, which is not part of this file.
EXCLUSION_PATTERNS = {
    'US_DRIVER_LICENSE': [
        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only (e.g., A, AB)
        r'^level\s*\d+$',  # "level 1", "level 2", etc.
        r'^tier\s*\d+$',  # "tier 1", "tier 2", etc.
        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
    ],
    'US_PASSPORT': [
        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits
        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only
    ],
    'PERSON': [
        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
        r'^[a-zA-Z]\d*$',  # Single letter with optional numbers (A, A1, B2)
        r'^(yes|no|true|false|y|n|t|f)$',  # Boolean values
        r'^(male|female|m|f|other)$',  # Gender categories
        r'^(low|medium|high|good|bad|excellent|poor)$',  # Rating values
    ],
    'EMAIL_ADDRESS': [
        r'^(test|demo|example|sample)@',  # Test emails
        r'@(test|demo|example|sample)\.',  # Test domains
    ],
    'PHONE_NUMBER': [
        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone patterns
        r'^1{6,}$',  # All 1s
        r'^0{6,}$',  # All 0s
    ],
}
# Custom Recognizers (domain-specific patterns)
# Add patterns specific to your industry/use case
# Each entry maps a custom entity name to its regex 'pattern', the confidence
# 'score' assigned to a match, and the anonymization 'strategy' (a key of
# STRATEGIES) to apply.
CUSTOM_PATTERNS = {
    'LOAN_ID': {
        'pattern': r'LN\d{8}',
        'score': 0.9,
        'strategy': 'HASH',
    },
    'EMPLOYEE_ID': {
        'pattern': r'EMP\d{6}',
        'score': 0.9,
        'strategy': 'HASH',
    },
    'ACCOUNT_NUMBER': {
        'pattern': r'ACC\d{10}',
        'score': 0.95,
        'strategy': 'REMOVE',
    },
}
# Column Name Heuristics
# Auto-flag columns based on name patterns
# A list of regex strings; how they are matched against column names is
# decided by the consumer, which is not part of this file.
RISKY_COLUMN_PATTERNS = [
    r'.*email.*',
    r'.*phone.*',
    r'.*ssn.*',
    r'.*social.*security.*',
    r'.*credit.*card.*',
    r'.*passport.*',
    r'.*license.*',
    r'.*address.*',
    r'.*ip.*addr.*',
]
# Protected Attributes Configuration
# These are needed for bias analysis but may contain PII
PROTECTED_ATTRIBUTES_HANDLING = {
    'default_strategy': 'KEEP',  # Keep for bias analysis
    'warn_user': True,  # Warn about privacy implications
    'alternative': 'Use generalization (e.g., age_group instead of exact age)',
}
# Audit Report Settings
AUDIT_CONFIG = {
    'include_examples': True,
    'max_examples_per_column': 3,
    'truncate_values': 50,  # Max characters to show in examples
    'include_presidio_metrics': True,
    'include_gdpr_references': True,
    'include_recommendations': True,
}
# Performance Settings
PERFORMANCE_CONFIG = {
    'sample_size_for_detection': 1000,  # Max rows to analyze per column
    'batch_size': 100,  # Rows to process per batch
    'enable_parallel': False,  # Future: parallel column processing
}
# Output Settings
OUTPUT_CONFIG = {
    'cleaned_suffix': '_cleaned',
    'audit_suffix': '_audit',
    'format': 'csv',  # Future: support parquet, json
    'compression': None,  # Future: gzip, bz2
}
def get_strategy_for_entity(entity_type: str) -> str:
    """
    Get anonymization strategy for an entity type.

    Lookup order:
        1. ``ENTITY_STRATEGY_MAP``.
        2. The ``'strategy'`` declared for the entity in ``CUSTOM_PATTERNS``,
           so a custom entity configured as e.g. 'REMOVE' is not silently
           downgraded to the default.
        3. ``'HASH'`` for entity types found in neither table.

    Args:
        entity_type: Presidio entity type (e.g., 'EMAIL_ADDRESS')

    Returns:
        Strategy name (e.g., 'HASH')
    """
    if entity_type in ENTITY_STRATEGY_MAP:
        return ENTITY_STRATEGY_MAP[entity_type]
    # Custom recognizers carry their own strategy; honor it before defaulting.
    custom_entry = CUSTOM_PATTERNS.get(entity_type, {})
    return custom_entry.get('strategy', 'HASH')  # Default to HASH if unknown
def get_risk_level(strategy: str) -> str:
    """
    Get risk level for a strategy.

    Args:
        strategy: Strategy name (e.g., 'HASH')

    Returns:
        The 'risk_level' recorded for the strategy in ``STRATEGIES``
        (e.g., 'MEDIUM'), or 'UNKNOWN' when the strategy is not a key of
        ``STRATEGIES`` or its entry has no 'risk_level'.
    """
    strategy_info = STRATEGIES.get(strategy, {})
    return strategy_info.get('risk_level', 'UNKNOWN')
def is_high_confidence(score: float) -> bool:
    """Check if confidence score is high enough for auto-processing.

    True when ``score`` is at or above ``CONFIDENCE_THRESHOLDS['HIGH']``.
    """
    return score >= CONFIDENCE_THRESHOLDS['HIGH']
def is_medium_confidence(score: float) -> bool:
    """Check if confidence score requires user confirmation.

    True when ``score`` is at or above ``CONFIDENCE_THRESHOLDS['MEDIUM']``
    and strictly below ``CONFIDENCE_THRESHOLDS['HIGH']``.
    """
    return CONFIDENCE_THRESHOLDS['MEDIUM'] <= score < CONFIDENCE_THRESHOLDS['HIGH']
def is_low_confidence(score: float) -> bool:
    """Check if confidence score might be false positive.

    True when ``score`` is strictly below ``CONFIDENCE_THRESHOLDS['MEDIUM']``.
    The cut-off is the 'MEDIUM' threshold (not 'LOW'), so every score falls
    into exactly one of the high / medium / low predicates.
    """
    return score < CONFIDENCE_THRESHOLDS['MEDIUM']
# Example usage in cleaning.py:
#     from cleaning_config import ENTITY_STRATEGY_MAP, get_strategy_for_entity
#     strategy = get_strategy_for_entity('EMAIL_ADDRESS')  # Returns 'HASH'