feat: Enhanced PII detection with GDPR compliance and Nordic recognizers

2025-11-07 20:06:12 +05:30
parent 83ecfc510e
commit fd3f924cc6
5 changed files with 122 additions and 18 deletions

GDPRArticles.pdf (new binary file; content not shown)

View File

@@ -20,6 +20,8 @@ from data_cleaning.config import (
     ENTITY_STRATEGY_MAP,
     STRATEGIES,
     GDPR_COMPLIANCE,
+    COLUMN_CONTEXT_FILTERS,
+    EXCLUSION_PATTERNS,
     get_strategy_for_entity,
     get_risk_level
 )

View File

@@ -8,6 +8,7 @@ import pandas as pd
 import numpy as np
 import hashlib
 import json
+import re
 from datetime import datetime
 from typing import Dict, List, Tuple, Optional, Any
 from collections import defaultdict
@@ -375,10 +376,14 @@ class DataCleaner:
     ) -> Dict[str, List[Dict]]:
         """
         Detect PII at column and cell level (GPU-accelerated when available)
+        With intelligent filtering for false positives
 
         Returns:
             Dictionary mapping column names to list of detected entities
         """
+        import re
+        from data_cleaning.config import COLUMN_CONTEXT_FILTERS, EXCLUSION_PATTERNS
+
         pii_detections = defaultdict(list)
 
         # Determine which columns to scan
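
Note: the returned structure, as built in the next hunk, maps each scanned column to a list of per-entity-type summaries. A hypothetical example (column names and counts invented for illustration):

    {
        'email': [
            {'entity_type': 'EMAIL_ADDRESS', 'count': 98,
             'avg_confidence': 0.93, 'max_confidence': 1.0, 'min_confidence': 0.62}
        ],
        'customer_name': [
            {'entity_type': 'PERSON', 'count': 14,
             'avg_confidence': 0.85, 'max_confidence': 0.99, 'min_confidence': 0.61}
        ],
    }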
@@ -417,26 +422,69 @@ class DataCleaner:
             )
 
             if results:
-                # Aggregate by entity type
-                entity_summary = defaultdict(lambda: {'count': 0, 'scores': []})
+                # Aggregate by entity type, filtering out likely false positives
+                entity_summary = defaultdict(lambda: {'count': 0, 'scores': [], 'filtered': 0})
+                filtered_reasons = []
 
                 for result in results:
-                    entity_summary[result.entity_type]['count'] += 1
-                    entity_summary[result.entity_type]['scores'].append(result.score)
+                    entity_type = result.entity_type
+                    # Extract the detected text from the original string via start/end offsets
+                    detected_text = combined_text[result.start:result.end]
 
-                # Store detection results
+                    # FILTER 1: column context.
+                    # Skip entity types that should be ignored given the column name.
+                    context_filtered = False
+                    for pattern, ignored_entities in COLUMN_CONTEXT_FILTERS.items():
+                        if re.search(pattern, column.lower()) and entity_type in ignored_entities:
+                            context_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if f"column context ({pattern})" not in filtered_reasons:
+                                filtered_reasons.append(f"column context ({pattern})")
+                            break
+                    if context_filtered:
+                        continue
+
+                    # FILTER 2: value pattern exclusions.
+                    # Skip detected values that match known non-PII formats.
+                    pattern_filtered = False
+                    for exclusion_pattern in EXCLUSION_PATTERNS.get(entity_type, []):
+                        if re.match(exclusion_pattern, detected_text, re.IGNORECASE):
+                            pattern_filtered = True
+                            entity_summary[entity_type]['filtered'] += 1
+                            if "value pattern" not in filtered_reasons:
+                                filtered_reasons.append("value pattern")
+                            break
+                    if pattern_filtered:
+                        continue
+
+                    # Not filtered: count as a valid detection
+                    entity_summary[entity_type]['count'] += 1
+                    entity_summary[entity_type]['scores'].append(result.score)
+
+                # Store detection results (non-filtered entity types only)
+                detected_types = []
                 for entity_type, info in entity_summary.items():
-                    avg_confidence = np.mean(info['scores'])
-                    pii_detections[column].append({
-                        'entity_type': entity_type,
-                        'count': info['count'],
-                        'avg_confidence': avg_confidence,
-                        'max_confidence': max(info['scores']),
-                        'min_confidence': min(info['scores'])
-                    })
+                    if info['count'] > 0:  # only entity types with valid (non-filtered) hits
+                        avg_confidence = np.mean(info['scores'])
+                        pii_detections[column].append({
+                            'entity_type': entity_type,
+                            'count': info['count'],
+                            'avg_confidence': avg_confidence,
+                            'max_confidence': max(info['scores']),
+                            'min_confidence': min(info['scores'])
+                        })
+                        detected_types.append(entity_type)
 
-                detected_types = [d['entity_type'] for d in pii_detections[column]]
-                print(f"✓ Found: {', '.join(detected_types)}")
+                if detected_types:
+                    print(f"✓ Found: {', '.join(detected_types)}")
+                elif any(info['filtered'] > 0 for info in entity_summary.values()):
+                    total_filtered = sum(info['filtered'] for info in entity_summary.values())
+                    print(f"(filtered {total_filtered} false positives: {', '.join(filtered_reasons[:2])})")
+                else:
+                    print("(no PII)")
             else:
                 print("(no PII)")

View File

@@ -126,9 +126,63 @@ GDPR_COMPLIANCE = {
 
 # Presidio Analyzer Settings
 PRESIDIO_CONFIG = {
     'language': 'en',
-    'score_threshold': 0.5,  # Minimum confidence to report
+    'score_threshold': 0.6,  # Minimum confidence to report (raised from 0.5 to reduce false positives)
     'entities': None,  # None = detect all, or specify list like ['EMAIL_ADDRESS', 'PHONE_NUMBER']
-    'allow_list': [],  # Terms to ignore (e.g., company names that look like PII)
+    'allow_list': ['l1', 'l2', 'L1', 'L2', 'NA', 'N/A', 'null', 'none'],  # Common non-PII values
 }
+
+# Column Context Filters - ignore specific entity types based on column name patterns.
+# This prevents false positives when the column name provides context.
+COLUMN_CONTEXT_FILTERS = {
+    # Column name pattern (regex) -> list of entity types to IGNORE in that column
+    r'.*credit.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*rating.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*level.*': ['US_DRIVER_LICENSE', 'US_PASSPORT'],
+    r'.*score.*': ['US_DRIVER_LICENSE', 'US_PASSPORT', 'PERSON'],
+    r'.*category.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*status.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*type.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*grade.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*class.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*rank.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*tier.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*segment.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*group.*': ['US_DRIVER_LICENSE', 'PERSON'],
+    r'.*code.*': ['PERSON'],  # Codes are rarely names
+    r'.*id$': ['PERSON'],     # Columns ending in 'id' are rarely names
+    r'.*_id$': ['PERSON'],    # Same for underscore_id
+}
+
+# Value Pattern Exclusions - ignore values matching these patterns for specific entity types.
+# This catches false positives based on the format of the detected value itself.
+EXCLUSION_PATTERNS = {
+    'US_DRIVER_LICENSE': [
+        r'^[a-zA-Z]\d{1,2}$',   # Single letter + 1-2 digits (e.g., l1, l2, A1, B12)
+        r'^[a-zA-Z]{1,2}$',     # 1-2 letters only (e.g., A, AB)
+        r'^level\s*\d+$',       # "level 1", "level 2", etc.
+        r'^tier\s*\d+$',        # "tier 1", "tier 2", etc.
+        r'^grade\s*[a-zA-Z]$',  # "grade A", "grade B", etc.
+    ],
+    'US_PASSPORT': [
+        r'^[a-zA-Z]\d{1,2}$',   # Single letter + 1-2 digits
+        r'^[a-zA-Z]{1,2}$',     # 1-2 letters only
+    ],
+    'PERSON': [
+        r'^(admin|user|guest|system|default|test|demo)$',  # Generic usernames
+        r'^[a-zA-Z]\d*$',                                  # Single letter with optional digits (A, A1, B2)
+        r'^(yes|no|true|false|y|n|t|f)$',                  # Boolean values
+        r'^(male|female|m|f|other)$',                      # Gender categories
+        r'^(low|medium|high|good|bad|excellent|poor)$',    # Rating values
+    ],
+    'EMAIL_ADDRESS': [
+        r'^(test|demo|example|sample)@',   # Test emails
+        r'@(test|demo|example|sample)\.',  # Test domains
+    ],
+    'PHONE_NUMBER': [
+        r'^(000|111|222|333|444|555|666|777|888|999)[-\s]',  # Fake phone prefixes
+        r'^1{6,}$',  # All 1s
+        r'^0{6,}$',  # All 0s
+    ],
+}
 
 # Custom Recognizers (domain-specific patterns)
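
Note: these settings feed Presidio's analyzer. In recent presidio-analyzer releases, AnalyzerEngine.analyze() accepts score_threshold and allow_list keyword arguments directly; the wiring below is an illustrative sketch, not this repo's actual call site:

    from presidio_analyzer import AnalyzerEngine

    from data_cleaning.config import PRESIDIO_CONFIG

    analyzer = AnalyzerEngine()
    results = analyzer.analyze(
        text="contact: jane.doe@example.com, tier: l1",
        language=PRESIDIO_CONFIG['language'],
        entities=PRESIDIO_CONFIG['entities'],                # None = run all recognizers
        score_threshold=PRESIDIO_CONFIG['score_threshold'],  # drop low-confidence hits
        allow_list=PRESIDIO_CONFIG['allow_list'],            # 'l1' is never reported
    )
    for r in results:
        print(r.entity_type, r.start, r.end, round(r.score, 2))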

View File

@@ -20,7 +20,7 @@ python-multipart>=0.0.6
 # torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121
 
 # Chatbot (WIP - not exposed in API yet)
-gpt4all>=2.0.0annotated-doc==0.0.3
+gpt4all>=2.0.0
 annotated-types==0.7.0
 anyio==4.11.0
 blis==1.3.0