Mirror of https://github.com/PlatypusPus/MushroomEmpire.git, synced 2026-02-07 22:18:59 +00:00
feat: Enhanced PII detection with GDPR compliance and Nordic recognizers
GDPRArticles.pdf (new binary file; content not shown)
@@ -20,6 +20,8 @@ from data_cleaning.config import (
     ENTITY_STRATEGY_MAP,
     STRATEGIES,
     GDPR_COMPLIANCE,
+    COLUMN_CONTEXT_FILTERS,
+    EXCLUSION_PATTERNS,
     get_strategy_for_entity,
     get_risk_level
 )
@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
     ) -> Dict[str, List[Dict]]:
         """
         Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
 
         Returns:
             Dictionary mapping column names to list of detected entities
         """
+        import re
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+
         pii_detections = defaultdict(list)
 
         # Determine which columns to scan
@@ -417,26 +422,69 @@ class DataCleaner:
             )
 
             if results:
-                # Aggregate by entity type
-                entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                # Aggregate by entity type with filtering
+                entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                filtered_reasons = []
 
                 for result in results:
-                    entity_summary[result.entity_type]['count'] += 1
-                    entity_summary[result.entity_type]['scores'].append(result.score)
+                    entity_type = result.entity_type
+                    # Extract detected text from original string using start/end positions
+                    detected_text = combined_text[result.start:result.end]
+
+                    # ✅ FILTER 1: Column Context Filtering
+                    # Skip if entity type should be ignored based on column name
+                    context_filtered = False
+                    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                            context_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if f"column context ({pattern})" not in filtered_reasons:
+                                filtered_reasons.append(f"column context ({pattern})")
+                            break
+
+                    if context_filtered:
+                        continue
+
+                    # ✅ FILTER 2: Value Pattern Exclusions
+                    # Skip if detected value matches exclusion patterns
+                    pattern_filtered = False
+                    if entity_type in EXCLUSION_PATTERNS:
+                        for exclusion_pattern in EXCLUSION_PATTERNS[entity_type]:
+                            if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                                pattern_filtered = True
+                                entity_summary[entity_type]['filtered'] += 1
+                                if f"value pattern ({exclusion_pattern[:20]}...)" not in filtered_reasons:
+                                    filtered_reasons.append(f"value pattern ({exclusion_pattern[:20]}...)")
+                                break
+
+                    if pattern_filtered:
+                        continue
+
+                    # ✅ Not filtered - count as valid detection
+                    entity_summary[entity_type]['count'] += 1
+                    entity_summary[entity_type]['scores'].append(result.score)
 
-                # Store detection results
-                for entity_type, info in entity_summary.items():
-                    avg_confidence = np.mean(info['scores'])
-                    pii_detections[column].append({
-                        'entity_type': entity_type,
-                        'count': info['count'],
-                        'avg_confidence': avg_confidence,
-                        'max_confidence': max(info['scores']),
-                        'min_confidence': min(info['scores'])
-                    })
+                # Store detection results (only non-filtered)
+                detected_types = []
+                for entity_type, info in entity_summary.items():
+                    if info['count'] > 0:  # Only include if we have valid (non-filtered) detections
+                        avg_confidence = np.mean(info['scores'])
+                        pii_detections[column].append({
+                            'entity_type': entity_type,
+                            'count': info['count'],
+                            'avg_confidence': avg_confidence,
+                            'max_confidence': max(info['scores']),
+                            'min_confidence': min(info['scores'])
+                        })
+                        detected_types.append(entity_type)
 
-                detected_types = [d['entity_type'] for d in pii_detections[column]]
-                print(f"✓ Found: {', '.join(detected_types)}")
+                if detected_types:
+                    print(f"✓ Found: {', '.join(detected_types)}")
+                elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                    total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                    print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                else:
+                    print("(no PII)")
             else:
                 print("(no PII)")
 
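For illustration, the snippet below is a minimal, self-contained sketch of the two-stage filtering this hunk introduces, using trimmed copies of the COLUMN_CONTEXT_FILTERS and EXCLUSION_PATTERNS tables from the config hunk further down. The helper name is_false_positive is hypothetical and is not part of this commit:

import re

# Trimmed copies of the tables defined in data_cleaning/config.py
COLUMN_CONTEXT_FILTERS = {
    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
}
EXCLUSION_PATTERNS = {
    'US_DRIVER_LICENSE': [r'^[a-zA-Z]\d{1,2}$'],
}

def is_false_positive(column: str, entity_type: str, detected_text: str) -> bool:
    """Return True when a Presidio hit should be discarded by either filter."""
    # Filter 1: the column name makes this entity type implausible here
    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
            return True
    # Filter 2: the detected value itself matches a known non-PII shape
    for exclusion_pattern in EXCLUSION_PATTERNS.get(entity_type, []):
        if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
            return True
    return False

# "l1" in a column named membership_level is a tier label, not a licence number
print(is_false_positive('membership_level', 'US_DRIVER_LICENSE', 'l1'))      # True
print(is_false_positive('licence_number', 'US_DRIVER_LICENSE', 'D1234567'))  # False

Note that Filter 1 never inspects the value, so a plausible licence number in a genuinely licence-related column still reaches Filter 2 and survives it.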
@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
     'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
     'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
 }
+
+# Column Context Filters - Ignore specific entity types based on column name patterns
+# This prevents false positives when column names provide context
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex) -> List of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],  # Columns ending in 'id' are rarely names
+    r'.*_id$': ['PERSON'],  # Same for underscore _id
+}
+
+# Value Pattern Exclusions - Ignore values matching these patterns for specific entity types
+# This catches false positives based on the actual detected value format
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',  # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',  # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',  # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',  # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',  # Single letter with optional numbers (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',  # Boolean values
+        r'^(male|female|m|f|other)$',  # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',  # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',  # Test emails
+        r'@(test|demo|example|sample)\.',  # Test domains
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone patterns
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
+}
 
 # Custom Recognizers (domain-specific patterns)
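As a usage sketch (not code from this commit), the PRESIDIO_CONFIG values above would typically be passed straight through to presidio-analyzer's AnalyzerEngine.analyze, which accepts language, entities, and score_threshold keyword arguments, plus allow_list in recent 2.2.x releases; the sample text here is invented:

from presidio_analyzer import AnalyzerEngine

PRESIDIO_CONFIG = {
    'language': 'en',
    'score_threshold': 0.6,
    'entities': None,
    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],
}

# AnalyzerEngine() loads a spaCy English model (e.g. en_core_web_lg) on startup
analyzer = AnalyzerEngine()
results = analyzer.analyze(
    text="Contact Jane Doe at jane.doe@example.org, membership: l1",
    language=PRESIDIO_CONFIG['language'],
    entities=PRESIDIO_CONFIG['entities'],                # None = all supported entity types
    score_threshold=PRESIDIO_CONFIG['score_threshold'],  # hits scoring below 0.6 are dropped
    allow_list=PRESIDIO_CONFIG['allow_list'],            # exact-match terms never reported as PII
)
for r in results:
    print(r.entity_type, r.start, r.end, round(r.score, 2))

With this configuration, "Jane Doe" and the email address would be reported while "l1" is suppressed by the allow list before the column-context and value-pattern filters ever run.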
@@ -20,7 +20,7 @@ python-multipart>=0.0.6
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
 
 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0