feat: Enhanced PII detection with GDPR compliance and Nordic recognizers

2025-11-07 20:06:12 +05:30
parent 83ecfc510e
commit fd3f924cc6
5 changed files with 122 additions and 18 deletions

GDPRArticles.pdf (new binary file; content not shown)

View File

@@ -20,6 +20,8 @@ from data_cleaning.config import (
     ENTITY_STRATEGY_MAP,
     STRATEGIES,
     GDPR_COMPLIANCE,
+    COLUMN_CONTEXT_FILTERS,
+    EXCLUSION_PATTERNS,
     get_strategy_for_entity,
     get_risk_level
 )

View File

@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
     ) -> Dict[str, List[Dict]]:
         """
         Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
 
         Returns:
             Dictionary mapping column names to list of detected entities
         """
+        import re
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+
         pii_detections = defaultdict(list)
 
         # Determine which columns to scan
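
Note: the returned structure, as built in the next hunk, maps each scanned column to a list of per-entity-type summaries. A hypothetical example (column names and counts invented for illustration):

    {
        'email': [
            {'entity_type': 'EMAIL_ADDRESS', 'count': 98,
             'avg_confidence': 0.93, 'max_confidence': 1.0, 'min_confidence': 0.62}
        ],
        'customer_name': [
            {'entity_type': 'PERSON', 'count': 14,
             'avg_confidence': 0.85, 'max_confidence': 0.99, 'min_confidence': 0.61}
        ],
    }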
@@ -417,26 +422,69 @@ class DataCleaner:
             )
 
             if results:
-                # Aggregate by entity type
-                entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                # Aggregate by entity type, filtering out likely false positives
+                entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                filtered_reasons = []
 
                 for result in results:
-                    entity_summary[result.entity_type]['count'] += 1
-                    entity_summary[result.entity_type]['scores'].append(result.score)
+                    entity_type = result.entity_type
+                    # Extract the detected text from the original string via start/end offsets
+                    detected_text = combined_text[result.start:result.end]
 
-                # Store detection results
+                    # FILTER 1: column context.
+                    # Skip entity types that should be ignored given the column name.
+                    context_filtered = False
+                    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                            context_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if f"column context ({pattern})" not in filtered_reasons:
+                                filtered_reasons.append(f"column context ({pattern})")
+                            break
+                    if context_filtered:
+                        continue
+
+                    # FILTER 2: value pattern exclusions.
+                    # Skip detected values that match known non-PII formats.
+                    pattern_filtered = False
+                    for exclusion_pattern in EXCLUSION_PATTERNS.get(entity_type, []):
+                        if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                            pattern_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if "value pattern" not in filtered_reasons:
+                                filtered_reasons.append("value pattern")
+                            break
+                    if pattern_filtered:
+                        continue
+
+                    # Not filtered: count as a valid detection
+                    entity_summary[entity_type]['count'] += 1
+                    entity_summary[entity_type]['scores'].append(result.score)
+
+                # Store detection results (non-filtered entity types only)
+                detected_types = []
                 for entity_type, info in entity_summary.items():
-                    avg_confidence = np.mean(info['scores'])
-                    pii_detections[column].append({
-                        'entity_type': entity_type,
-                        'count': info['count'],
-                        'avg_confidence': avg_confidence,
-                        'max_confidence': max(info['scores']),
-                        'min_confidence': min(info['scores'])
-                    })
+                    if info['count'] > 0:  # only entity types with valid (non-filtered) hits
+                        avg_confidence = np.mean(info['scores'])
+                        pii_detections[column].append({
+                            'entity_type': entity_type,
+                            'count': info['count'],
+                            'avg_confidence': avg_confidence,
+                            'max_confidence': max(info['scores']),
+                            'min_confidence': min(info['scores'])
+                        })
+                        detected_types.append(entity_type)
 
-                detected_types = [d['entity_type'] for d in pii_detections[column]]
-                print(f"✓ Found: {', '.join(detected_types)}")
+                if detected_types:
+                    print(f"✓ Found: {', '.join(detected_types)}")
+                elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                    total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                    print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                else:
+                    print("(no PII)")
             else:
                 print("(no PII)")

View File

@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
 
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
     'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
     'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
 }
+
+# Column Context Filters - ignore specific entity types based on column name patterns.
+# This prevents false positives when the column name provides context.
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex) -> list of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],     # Columns ending in 'id' are rarely names
+    r'.*_id$': ['PERSON'],    # Same for underscore_id
+}
+
+# Value Pattern Exclusions - ignore values matching these patterns for specific entity types.
+# This catches false positives based on the format of the detected value itself.
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',   # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',     # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',       # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',        # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',   # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',     # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',                                  # Single letter with optional digits (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',                  # Boolean values
+        r'^(male|female|m|f|other)$',                      # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',    # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',   # Test emails
+        r'@(test|demo|example|sample)\.',  # Test domains
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone prefixes
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
+}
 
 # Custom Recognizers (domain-specific patterns)
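
Note: these settings feed Presidio's analyzer. In recent presidio-analyzer releases, AnalyzerEngine.analyze() accepts score_threshold and allow_list keyword arguments directly; the wiring below is an illustrative sketch, not this repo's actual call site:

    from presidio_analyzer import AnalyzerEngine

    from data_cleaning.config import PRESIDIO_CONFIG

    analyzer = AnalyzerEngine()
    results = analyzer.analyze(
        text="contact: jane.doe@example.com, tier: l1",
        language=PRESIDIO_CONFIG['language'],
        entities=PRESIDIO_CONFIG['entities'],                # None = run all recognizers
        score_threshold=PRESIDIO_CONFIG['score_threshold'],  # drop low-confidence hits
        allow_list=PRESIDIO_CONFIG['allow_list'],            # 'l1' is never reported
    )
    for r in results:
        print(r.entity_type, r.start, r.end, round(r.score, 2))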

View File

@@ -20,7 +20,7 @@ python-multipart>=0.0.6
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
 
 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0